From 5471d16278b85757d37c1566c83919440e7189be Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 28 Feb 2022 19:24:23 +0800 Subject: [PATCH 001/272] fix where api doc (#39980) --- python/paddle/tensor/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index ecf70ffe4a1..0ba47d79050 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -542,7 +542,7 @@ def where(condition, x=None, y=None, name=None): Args: - condition(Tensor): The condition to choose x or y. + condition(Tensor): The condition to choose x or y. When True(nonzero), yield x, otherwise yield y. x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. -- GitLab From 496776367781aba1e4eea190da75a8c339aec43d Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 28 Feb 2022 22:38:07 +0800 Subject: [PATCH 002/272] [custom kernel] change kernel name judgement and remove macro control for selected_row (#39977) --- paddle/phi/core/custom_kernel.cc | 8 ++++---- paddle/phi/core/kernel_registry.h | 4 ---- paddle/phi/core/kernel_utils.h | 6 ------ paddle/phi/core/tensor_meta.h | 7 ------- paddle/phi/tests/core/test_custom_kernel.cc | 8 +++----- 5 files changed, 7 insertions(+), 26 deletions(-) diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index f84a2bd8d9c..58f9e1c623e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { auto& kernel_info_map = custom_kernel_map.GetMap(); VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernel_info_map) { - PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), - true, + PADDLE_ENFORCE_NE( + kernels.find(pair.first), + kernels.end(), phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", pair.first)); for (auto& info_pair : pair.second) { - auto& kernels = KernelFactory::Instance().kernels(); PADDLE_ENFORCE_EQ( kernels[pair.first].find(info_pair.first), kernels[pair.first].end(), diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 6a1688947b9..7a05452cbeb 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,13 +87,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -105,13 +103,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of 
attribute, maybe diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 2fda3cb6db4..e5de5e2b49e 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -23,9 +23,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" -#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" -#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -222,9 +220,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -259,9 +255,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 3d2da542c74..f4bd0be0b45 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -23,13 +23,6 @@ limitations under the License. */ #include "paddle/utils/any.h" #include "paddle/utils/optional.h" -// Note: mixed_vector include many header now, LoD will be -// used on CUDA device? Can we use small_vector here? 
-// @zhanlve: Rollback to original LoD for now -#ifndef PADDLE_WITH_CUSTOM_KERNEL -#include "paddle/fluid/framework/mixed_vector.h" -#endif - namespace phi { using DDim = phi::DDim; diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index d8e42c9d0d8..69922c055cb 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) { custom_fake_dot_kernels.end()); // 3.before register - auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); + EXPECT_TRUE(kernels.find(op_name) == kernels.end()); - // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while - // registering + // mock fake_dot is supported by phi for check while registering auto& fake_dot_kernels = kernels[op_name]; EXPECT_TRUE(fake_dot_kernels.find( @@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // 4.kernel select - auto kernel = kernel_factory_instance.SelectKernelOrThrowError( + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8)); // 5.prepare parameters for kernel -- GitLab From 1b585b2896c05f08a69c6513ba16fd6817739118 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Mon, 28 Feb 2022 22:50:21 +0800 Subject: [PATCH 003/272] Move index sample (#39905) * graph engine demo * upload unsaved changes * fix dependency error * fix shard_num problem * py client * remove lock and graph-type * add load direct graph * add load direct graph * add load direct graph * batch random_sample * batch_sample_k * fix num_nodes size * batch brpc * batch brpc * add test * add test * add load_nodes; change add_node function * change sample return type to pair * resolve conflict * resolved conflict * resolved conflict * separate server and client * merge pair type * fix * resolved conflict * fixed segment fault; high-level VLOG for load edges and load nodes * random_sample return 0 * rm useless loop * test:load edge * fix ret -1 * test: rm sample * rm sample * random_sample return future * random_sample return int * test fake node * fixed here * memory leak * remove test code * fix return problem * add common_graph_table * random sample node &test & change data-structure from linkedList to vector * add common_graph_table * sample with srand * add node_types * optimize nodes sample * recover test * random sample * destruct weighted sampler * GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * pybind sample nodes api * pull nodes with step * fixed pull_graph_list bug; add test for pull_graph_list by step * add graph table;name * add graph table;name * add pybind * add pybind * add FeatureNode * add FeatureNode * add FeatureNode Serialize * add FeatureNode Serialize * get_feat_node * avoid local rpc * fix get_node_feat * fix get_node_feat * remove log * get_node_feat return py:bytes * merge develop with graph_engine * fix threadpool.h head * fix * fix typo * resolve conflict * fix conflict * recover lost content * fix pybind of FeatureNode * recover cmake * recover tools * resolve conflict * resolve linking problem * code style * change test_server port * fix code problems * remove shard_num config * remove redundent threads * optimize start server * remove logs * fix code problems by 
reviewers' suggestions * move graph files into a folder * code style change * remove graph operations from base table * optimize get_feat function of graph engine * fix long long count problem * remove redandunt graph files * remove unused shell * recover dropout_op_pass.h * fix potential stack overflow when request number is too large & node add & node clear & node remove * when sample k is larger than neigbor num, return directly * using random seed generator of paddle to speed up * fix bug of random sample k * fix code style * fix code style * add remove graph to fleet_py.cc * fix blocking_queue problem * fix style * fix * recover capacity check * add remove graph node; add set_feature * add remove graph node; add set_feature * add remove graph node; add set_feature * add remove graph node; add set_feature * fix distributed op combining problems * optimize * remove logs * fix MultiSlotDataGenerator error * cache for graph engine * fix type compare error * more test&fix thread terminating problem * remove header * change time interval of shrink * use cache when sample nodes * remove unused function * change unique_ptr to shared_ptr * simplify cache template * cache api on client * fix * reduce sample threads when cache is not used * reduce cache memory * cache optimization * remove test function * remove extra fetch function * graph-engine data transfer optimization * support graph_split load&query * remove logs * change shards to pointer vector * use inference * remove test code * renorm op * simplify renorm op * recover local changes * recover renorm op kernel * fix init * add blanklines in renorm doc * fix import * fix import * add renorm to init.py * merge * move index_sample op * Delete api.h * Delete api.cc * fix * remove logs * recover infer shape of grad * recover changes * change shape * fix label * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix Co-authored-by: Huang Zhengjie <270018958@qq.com> Co-authored-by: Weiyue Su Co-authored-by: suweiyue Co-authored-by: luobin06 Co-authored-by: liweibin02 Co-authored-by: tangwei12 --- paddle/fluid/operators/index_sample_op.cc | 61 +---- paddle/fluid/operators/index_sample_op.cu | 215 ------------------ paddle/fluid/operators/index_sample_op.h | 198 ---------------- paddle/fluid/operators/index_sample_op_npu.cc | 3 +- paddle/phi/infermeta/binary.cc | 35 +++ paddle/phi/infermeta/binary.h | 5 + .../kernels/cpu/index_sample_grad_kernel.cc | 106 +++++++++ paddle/phi/kernels/cpu/index_sample_kernel.cc | 118 ++++++++++ .../kernels/gpu/index_sample_grad_kernel.cu | 146 ++++++++++++ paddle/phi/kernels/gpu/index_sample_kernel.cu | 119 ++++++++++ paddle/phi/kernels/index_sample_grad_kernel.h | 28 +++ paddle/phi/kernels/index_sample_kernel.h | 27 +++ paddle/phi/ops/compat/index_sample_sig.cc | 30 +++ 13 files changed, 623 insertions(+), 468 deletions(-) delete mode 100644 paddle/fluid/operators/index_sample_op.cu delete mode 100644 paddle/fluid/operators/index_sample_op.h create mode 100644 paddle/phi/kernels/cpu/index_sample_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/index_sample_kernel.cc create mode 100644 paddle/phi/kernels/gpu/index_sample_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/index_sample_kernel.cu create mode 100644 paddle/phi/kernels/index_sample_grad_kernel.h create mode 100644 paddle/phi/kernels/index_sample_kernel.h create mode 100644 paddle/phi/ops/compat/index_sample_sig.cc diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 
2d97797cfec..68d002fceea 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PT_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index 
e8acbfb8be9..00000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool 
index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c54..00000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bc..38eb5b45149 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index dfaabf7cae2..1905e33bd03 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 02750482dcc..a0140c9a579 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc new file mode 100644 index 00000000000..006711ceef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleGradInner(const Context& context, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad) { + std::vector out_grad_vec; + std::vector index_vec; + paddle::framework::TensorToVector(out_grad, context, &out_grad_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + auto index_dims = index.dims(); + auto x_grad_dims = x_grad->dims(); + + auto value_length = x_grad_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector x_grad_vec(x_grad->numel(), 0); + + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + int v_i = b * value_length + static_cast(index_vec[i]); + x_grad_vec[v_i] += out_grad_vec[i]; + } + context.template Alloc(x_grad); + paddle::framework::TensorFromVector(x_grad_vec, context, x_grad); + x_grad->Resize(x_grad_dims); +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } else if (index_type == DataType::INT64) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + CPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc new file mode 100644 index 00000000000..21bf9faee13 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleInner(const Context &context, + const DenseTensor &input, + const DenseTensor &index, + DenseTensor *output) { + auto input_dims = input.dims(); + auto index_dims = index.dims(); + + int batch_size = input_dims[0]; + auto value_length = input_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + std::vector res(index_ids_num); + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + + int v_i = b * value_length + static_cast(index_vec[i]); + T v = input_vec[v_i]; + VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i + << " value = " << v; + res[i] = v; + } + + auto ddim = phi::make_ddim({batch_size, index_length}); + context.template Alloc(output); + paddle::framework::TensorFromVector(res, context, output); + output->Resize(ddim); +} + +template +void IndexSampleKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleInner(ctx, x, index, out); + } else if (index_type == DataType::INT64) { + IndexSampleInner(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + CPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu new file mode 100644 index 00000000000..8b1ef964124 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_grad_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +}; + +template +__global__ void IndexSampleGrad(const IndexT* index, + T* in_grad, + const T* out_grad, + size_t index_length, + size_t input_length, + size_t batch_size, + bool same_data_in_row = true) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + if (same_data_in_row) { + paddle::platform::CudaAtomicAdd( + &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + } else { + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + } + } + } +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + const T* output_grad_data = out_grad.data(); + T* input_grad_data = ctx.template Alloc(x_grad); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + + auto stream = reinterpret_cast(ctx).stream(); + auto input_num = x.numel(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = index_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + bool same_data_in_index_row = index_length == 1 ? 
false : true; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + auto block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + phi::funcs::SetConstant set_zero; + set_zero(ctx, x_grad, static_cast(0)); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + GPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu new file mode 100644 index 00000000000..0e042089e1e --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +} + +template +__global__ void IndexSampleForward(const IndexT* index, + const T* in_data, + T* out_data, + size_t index_length, + size_t input_length, + size_t batch_size) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + } + } +} + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + const T* in_data = x.data(); + T* out_data = ctx.template Alloc(out); + auto stream = reinterpret_cast(ctx).stream(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = input_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + int block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + GPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/index_sample_grad_kernel.h b/paddle/phi/kernels/index_sample_grad_kernel.h new file mode 100644 index 00000000000..5c6e101f1b4 --- /dev/null +++ b/paddle/phi/kernels/index_sample_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_sample_kernel.h b/paddle/phi/kernels/index_sample_kernel.h new file mode 100644 index 00000000000..fb43c0c6c5f --- /dev/null +++ b/paddle/phi/kernels/index_sample_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc new file mode 100644 index 00000000000..0d2aed68a72 --- /dev/null +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSampleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_sample_grad", + {GradVarName("Out"), "X", "Index"}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, + phi::IndexSampleGradOpArgumentMapping); -- GitLab From d17961edc0f32f640861db93ed2e8660062ba2b7 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 1 Mar 2022 09:55:33 +0800 Subject: [PATCH 004/272] Optimize the CUDA kernel in DistributedFusedLamb optimizer (#39972) * vectorize lamb kernel * remove flags, add ut * remove useless codes * refine code, add param order --- .../distributed_fused_lamb_init_op.cc | 39 +- .../distributed_fused_lamb_init_op.cu | 162 ++--- .../optimizers/distributed_fused_lamb_op.cc | 34 +- .../optimizers/distributed_fused_lamb_op.cu | 682 ++++++++++-------- .../operators/optimizers/multi_tensor_apply.h | 61 +- .../distributed_fused_lamb_test_base.py | 5 + .../optimizer/distributed_fused_lamb.py | 21 +- 7 files changed, 546 insertions(+), 458 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef141..efec50efa92 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. 
Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658b..7d8a7186d58 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + param_order_t->Resize({static_cast(param_num)}); + auto *param_order = 
param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * sizeof(numel_offsets[0]), stream); - auto config = 
platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb3..8f7c87912e9 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. 
It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f772..ca0828a6f6a 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int vec8 = alignof(platform::AlignedVector); constexpr int vec4 = alignof(platform::AlignedVector); constexpr int vec2 = alignof(platform::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) 
\ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -353,7 +332,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = 
static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! -template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; - - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; - - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; - - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector param_vec; + platform::AlignedVector grad_vec; + platform::AlignedVector weight_decay_vec; + platform::AlignedVector mom1_vec; + platform::AlignedVector mom2_vec; + platform::AlignedVector trust_ratio_div_vec; + + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + platform::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + platform::Load(grad_p + i, &grad_vec); + platform::Load(mom1_p + i, &mom1_vec); + platform::Load(mom2_p + i, &mom2_vec); + +#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T 
mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + platform::Store(mom1_vec, mom1_p + i); + platform::Store(mom2_vec, mom2_p + i); + platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } + + for (; i < num; ++i) { + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - "The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - 
platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +551,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return master_param_; } private: T *__restrict__ param_; @@ -538,158 +577,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; + + using MT = MasterT; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? 
param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + platform::AlignedVector trust_ratio_div_vec; + platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + platform::AlignedVector master_param_vec; + platform::Load(master_param + i, &master_param_vec); + platform::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + platform::Store(master_param_vec, master_param + i); + platform::Store(param_vec, param + i); + } else { + platform::AlignedVector param_vec; + platform::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + platform::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? +template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? 
p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -1005,15 +1055,16 @@ class 
DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. 
Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? 
nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
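// Each rank applies the LAMB trust-ratio update computed by
// LambUpdateParamAndBetaPowsFunctor above to its own parameter shard:
//   ratio = lr * sqrt(param_square_norm / trust_ratio_div_square_norm)
//           (plain lr if either squared norm is zero)
//   param -= ratio * trust_ratio_div
// and, when num_devices > 1, the ncclAllGather calls below rebuild the
// full fp32/fp16 parameters from the per-rank shards.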
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733d..179e8f45254 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index e0529c5d5f8..00d2a1f71d6 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): grad_clip = kwargs.get('grad_clip', None) clip_after_allreduce = kwargs.get('clip_after_allreduce', True) + parameters = [p.name for p in main.all_parameters()] + exclude_fn = lambda var: var.name in parameters[::4] + kwargs['exclude_from_weight_decay_fn'] = exclude_fn + kwargs['lamb_weight_decay'] = 0.1 + if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index e7c3cfbb7b9..cc33a909632 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer): moment2.is_distributed = True beta1pow = self._create_persistable_var('beta1pow') beta2pow = self._create_persistable_var('beta2pow') - fused_indices = self._create_persistable_var( - 'fused_indices', dtype='int32') - weight_decay = self._create_persistable_var('weight_decay') - weight_decay.is_distributed = True + param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True @@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer): 'fp16_partial_fused_offsets', dtype='int32') 
fp16_partial_fused_offsets.is_distributed = True + param_order = self._create_persistable_var('param_order', dtype='int32') + param_order.is_distributed = True + rank = get_rank() nranks = get_world_size() scale = self._get_or_create_scale() params = [p for p, _ in params_grads] grads = [g for _, g in params_grads] - weight_decay_values = [self._weight_decay] * len(params) + apply_weight_decay = [1] * len(params) if self._exclude_from_weight_decay_fn is not None: for i, p in enumerate(params): if self._exclude_from_weight_decay_fn(p): - weight_decay_values[i] = 0.0 + apply_weight_decay[i] = 0 startup_block = self.helper.startup_program.global_block() for g in grads: @@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'ParamOut': params, @@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer): 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FusedParamOffsets': [fused_offsets], + 'ParamOrder': [param_order], }, attrs={ 'alignment': self._alignment, 'rank': rank, 'nranks': nranks, - 'weight_decay': weight_decay_values, + 'apply_weight_decay': apply_weight_decay, 'moment1': 0.0, 'moment2': 0.0, 'beta1': self._beta1, @@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'Param': params, @@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer): 'FusedParamOffsets': [fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], + 'ParamOrder': [param_order], }, outputs={ 'FP32FusedParamOut': [fp32_fused_param], @@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer): 'FoundInf': [self._found_inf], }, attrs={ + 'weight_decay': self._weight_decay, 'beta1': self._beta1, 'beta2': self._beta2, 'epsilon': self._epsilon, -- GitLab From 4149cabeec527fa171a45a10ab21ba7fd1374a3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Tue, 1 Mar 2022 10:00:01 +0800 Subject: [PATCH 005/272] add type constrait for DenseTensor (#39967) --- paddle/infrt/dialect/infrt/infrt_ops_base.td | 6 ++++++ paddle/infrt/dialect/init_infrt_dialects.cc | 4 ++-- paddle/infrt/dialect/phi/CMakeLists.txt | 11 +---------- paddle/infrt/dialect/phi/ir/CMakeLists.txt | 9 +++++++++ paddle/infrt/dialect/phi/{ => ir}/infrt_phi_base.td | 0 .../infrt/dialect/phi/{ => ir}/infrt_phi_kernel.td | 2 +- .../infrt/dialect/phi/{ => ir}/infrt_phi_tensor.cc | 10 +++++----- paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.h | 8 ++++---- .../infrt/dialect/phi/{ => ir}/infrt_phi_tensor.td | 2 +- paddle/infrt/dialect/phi/{ => ir}/phi_base.cc | 12 ++++++------ paddle/infrt/dialect/phi/{ => ir}/phi_base.h | 8 +++++--- 11 files changed, 40 insertions(+), 32 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/CMakeLists.txt rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_base.td (100%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_kernel.td (92%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.cc (71%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.h (83%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.td 
(97%) rename paddle/infrt/dialect/phi/{ => ir}/phi_base.cc (84%) rename paddle/infrt/dialect/phi/{ => ir}/phi_base.h (84%) diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 81d3d028a66..978b126d754 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ); } +// Type Constrait for concrete DenseTensor type. +class DenseTensor : + Type, + "!infrt.DenseTensor<"#target#","#precision#","#layout#">", + "::infrt::DenseTensorType">; + // Base class for infrt dialect attributes. class Infrt_Attr traits = [], string baseCppClass = "::mlir::Attribute"> diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index b5b8de7a20d..c5c81b4b0f2 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 626b02c1f79..d477b6b9bdc 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI) return() endif() -#mlir_tablegen_on(infrt_phi_base DIALECT phi) -add_mlir_dialect(infrt_phi_base phi) -add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) -#mlir_tablegen_on(infrt_phi_tensor) - -gather_srcs(infrt_src SRCS - phi_base.cc infrt_phi_tensor.cc - infrt_phi_tensor.cc) - +add_subdirectory(ir) add_subdirectory(pass) add_executable(phi-exec phi_exec.cc) diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt new file mode 100644 index 00000000000..8c1d75629d0 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -0,0 +1,9 @@ +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc + infrt_phi_tensor.cc) diff --git a/paddle/infrt/dialect/phi/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td similarity index 100% rename from paddle/infrt/dialect/phi/infrt_phi_base.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_base.td diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td similarity index 92% rename from paddle/infrt/dialect/phi/infrt_phi_kernel.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 879994907cc..37bf0b5ef21 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_KernelDialect : Dialect { let name = "phi_kernel"; diff --git 
a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc similarity index 71% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.cc rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc index 9df1a47031b..64780294be9 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc" namespace infrt { namespace phi { @@ -25,7 +25,7 @@ namespace phi { void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" >(); } @@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.h rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 2780f975918..9a92558daab 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td similarity index 97% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index b7b3b061fdb..dc3a4b340d7 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -2,7 +2,7 @@ #else #define PHI_TENSOR -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" diff --git a/paddle/infrt/dialect/phi/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.cc rename to paddle/infrt/dialect/phi/ir/phi_base.cc index a1caa40f638..7a6b3f3f0a4 100644 --- a/paddle/infrt/dialect/phi/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include #include @@ -21,8 +21,8 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" namespace infrt { namespace phi { @@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type, void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } @@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.h rename to paddle/infrt/dialect/phi/ir/phi_base.h index 11174290f92..a08d8229fcc 100644 --- a/paddle/infrt/dialect/phi/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -19,11 +19,13 @@ #include -#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc" namespace mlir { namespace OpTrait { -- GitLab From 75280d36afe1e5e4aab0df51a9d7ee0828ee12fa Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 1 Mar 2022 10:24:17 +0800 Subject: [PATCH 006/272] remove dot infershape (#39945) --- paddle/fluid/operators/dot_op.cc | 55 ++++++-------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796ee..a86a3bb3592 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - 
PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PT_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); -- GitLab From 657dd5a97de6b54e59aa60a7d7afcab33bf36420 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Tue, 1 Mar 2022 10:48:13 +0800 Subject: [PATCH 007/272] Optimize group_norm op forward (#39596) * optimize group norm forward * use vectorized optimization * add scalar calculation code * optimize code --- paddle/fluid/operators/group_norm_op.cu | 149 ++++++++++++++++++++---- 1 file changed, 129 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998..b376334f1e9 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, -- GitLab From 4da841e0caeb36b758039b4afa8758dd91d6252c Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Tue, 1 Mar 2022 10:53:16 +0800 Subject: [PATCH 008/272] [DP] Construct reducer group (#39987) * add reducer --- .../distributed/collective/CMakeLists.txt | 1 + .../fluid/distributed/collective/reducer.cc | 131 ++++++++++++++ paddle/fluid/distributed/collective/reducer.h | 32 ++++ paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/distributed_py.cc | 14 ++ python/paddle/fluid/dygraph/parallel.py | 8 +- .../tests/unittests/test_imperative_group.py | 168 ++++++++---------- 7 files changed, 265 insertions(+), 91 deletions(-) create mode 100644 paddle/fluid/distributed/collective/reducer.cc create mode 100644 paddle/fluid/distributed/collective/reducer.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 41652f8b6ed..a5b40f8aa07 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS 
eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 00000000000..59f3ea3b0a7 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new 
group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 00000000000..f8c75385ef8 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 1f06eda8a2e..c61e8212b02 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -81,7 +81,7 @@ set(PYBIND_SRCS cuda_streams_py.cc) if(NOT ON_INFER) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index e057fb53cce..7b59188a9f3 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/imperative/layer.h" @@ -143,6 +144,19 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroupStrategy &self, int nrings) { self.nrings_ = nrings; }); + + m->def("eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); } } // end namespace pybind diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ddb86848f84..0049f387b70 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -560,13 +560,19 @@ class DataParallel(layers.Layer): strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=False): + find_unused_parameters=False, + process_group=None, + gradient_as_buffer_view=False, + static_graph=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers self.find_unused_parameters = find_unused_parameters self.grad_need_sync = True + self.process_group = process_group + self.gradient_as_buffer_view = gradient_as_buffer_view + self.static_graph = static_graph # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. 
# It just stores some environment variables, which can be constructed by diff --git a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py index f9635809651..89535797ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_group.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py @@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.nn import Linear import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer - - -class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): - super(MLP, self).__init__() - - self._linear1 = Linear(784, 10) - self._linear2 = Linear(10, 10) - - def forward(self, inputs): - y = self._linear1(inputs) - y = self._linear2(y) - return y +from paddle.fluid.framework import _test_eager_guard class TestDataParallelGroup(unittest.TestCase): - def create_varbase(self, dtype, shape, - type=core.VarDesc.VarType.LOD_TENSOR): - return core.VarBase(dtype, shape, "", type, True) + def create_varbase(self, dtype, shape): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.assign_group_by_size(*args) def test_construct_group0(self): # one dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400]) self.assertEqual([[0], [1], [2], [3]], res) def test_construct_group1(self): # multi dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [400]) self.assertEqual([[0, 2], [1, 3], [4], [5]], res) def test_construct_group2(self): # one dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - res = 
core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400, 800]) self.assertEqual([[0], [1, 2], [3]], res) def test_construct_group3(self): # multi dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [200, 400]) self.assertEqual([[0], [1], [2, 4], [3, 5]], res) def test_construct_group4(self): # multi dtype & zero limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [0]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group5(self): # multi dtype & infinite capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, 
False, False, False, False, False], [10000]) self.assertEqual([[0, 2, 4], [1, 3, 5]], res) def test_construct_group6(self): # multi dtype & limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase( + "float32", + [1, 50], )) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [400]) self.assertEqual([[0], [1, 3], [2, 4], [5]], res) def test_construct_group7(self): # multi dtype & multi limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [200, 400]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group8(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400], [3, 0, 1, 2]) self.assertEqual([[3, 0], [1], [2]], res) def test_construct_group9(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - 
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000])) - res = core.assign_group_by_size(var_list, [False, False, False, True], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 1000])) + res = self.assign_group_by_size(var_list, [False, False, False, True], [300], [1, 0, 2, 3]) self.assertEqual([[1, 0], [3], [2]], res) +class TestDataParallelGroupEager(TestDataParallelGroup): + def create_varbase(self, dtype, shape): + with _test_eager_guard(): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.eager_assign_group_by_size(*args) + + if __name__ == '__main__': unittest.main() -- GitLab From 8c2379732257f6d6bdf8fbe9157afea51a364942 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 1 Mar 2022 10:59:51 +0800 Subject: [PATCH 009/272] [Phi] Migrate logical_and/or/not/xor into Phi (#39942) * [Phi] Migrate logical_and/or/not/xor into Phi * fix unittest * fix function name --- .../operators/controlflow/CMakeLists.txt | 2 +- .../fluid/operators/controlflow/logical_op.cc | 10 +- .../fluid/operators/controlflow/logical_op.cu | 69 ----------- .../fluid/operators/controlflow/logical_op.h | 111 ------------------ .../operators/controlflow/logical_op_npu.cc | 2 +- paddle/phi/kernels/cpu/logical_kernel.cc | 72 ++++++++++++ paddle/phi/kernels/funcs/logical_functor.h | 41 +++++++ paddle/phi/kernels/gpu/logical_kernel.cu | 79 +++++++++++++ paddle/phi/kernels/logical_kernel.h | 38 ++++++ .../fluid/tests/unittests/test_diff_op.py | 2 +- 10 files changed, 234 insertions(+), 192 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/logical_op.cu delete mode 100644 paddle/fluid/operators/controlflow/logical_op.h create mode 100644 paddle/phi/kernels/cpu/logical_kernel.cc create mode 100644 paddle/phi/kernels/funcs/logical_functor.h create mode 100644 paddle/phi/kernels/gpu/logical_kernel.cu create mode 100644 paddle/phi/kernels/logical_kernel.h diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7b..a974f2ec335 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -20,5 +20,5 @@ else() endif() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d40543..4d11cb5ff74 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed..00000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858..00000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035..c3d7df8d027 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc 
+++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc new file mode 100644 index 00000000000..3d179e1e75f --- /dev/null +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* out_ptr = dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + + paddle::platform::Transform trans; + trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + CPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) +REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/funcs/logical_functor.h b/paddle/phi/kernels/funcs/logical_functor.h new file mode 100644 index 00000000000..1ea7fc43e6b --- /dev/null +++ b/paddle/phi/kernels/funcs/logical_functor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct LogicalNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu new file mode 100644 index 00000000000..f32d4c77d40 --- /dev/null +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" +#include "paddle/phi/kernels/gpu/elementwise.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + using InT = typename funcs::Logical##type##Functor::ELEMENT_TYPE; \ + using OutT = bool; \ + dev_ctx.template Alloc(out); \ + funcs::Logical##type##Functor binary_func; \ + std::vector ins = {&x, &y}; \ + std::vector outs = {out}; \ + funcs::BroadcastKernel( \ + dev_ctx, ins, &outs, -1, binary_func); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + using InT = typename funcs::LogicalNotFunctor::ELEMENT_TYPE; + using OutT = bool; + + dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + GPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/logical_kernel.h b/paddle/phi/kernels/logical_kernel.h new file mode 100644 index 00000000000..3ccc03a5b59 --- /dev/null +++ b/paddle/phi/kernels/logical_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DECLEAR_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out); + +DECLEAR_LOGICAL_BINARY_KERNEL(And) +DECLEAR_LOGICAL_BINARY_KERNEL(Or) +DECLEAR_LOGICAL_BINARY_KERNEL(Xor) +#undef DECLEAR_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 345dad54132..1ae780f488d 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase): def test_dygraph(self): for place in self.places: - paddle.disable_static(place) + paddle.disable_static() x = paddle.to_tensor(self.input, place=place) if self.prepend is not None: self.prepend = paddle.to_tensor(self.prepend, place=place) -- GitLab From e8d4558366d1dbf81f341eac5bbdb712eeb1ba0d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 1 Mar 2022 11:13:32 +0800 Subject: [PATCH 010/272] [PHI] Support Multi Input and Output for InferShape (#39870) * add multi input for infer_shape * support multi output for infershape * fix split bug * fix bug of concat * support vector in infrt * fix bug --- paddle/fluid/framework/infershape_utils.cc | 69 +++++++++++------- paddle/fluid/operators/concat_op.cc | 44 +++--------- paddle/fluid/operators/split_op.cc | 55 +++------------ paddle/infrt/host_context/value.h | 2 +- paddle/phi/api/lib/api_custom_impl.cc | 6 +- paddle/phi/core/infermeta_utils.cc | 16 ++--- paddle/phi/core/infermeta_utils.h | 15 ++-- paddle/phi/infermeta/multiary.cc | 23 ++++-- paddle/phi/infermeta/multiary.h | 2 +- paddle/phi/infermeta/unary.cc | 82 ++++++++++++---------- paddle/phi/infermeta/unary.h | 2 +- paddle/phi/kernels/concat_kernel.h | 5 +- paddle/phi/kernels/cpu/concat_kernel.cc | 4 +- paddle/phi/kernels/cpu/split_kernel.cc | 14 ---- paddle/phi/kernels/gpu/split_kernel.cu | 14 ---- paddle/phi/kernels/split_kernel.h | 12 ++-- python/paddle/utils/code_gen/api_base.py | 19 ++++- 17 files changed, 175 insertions(+), 209 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d..d9287b9a624 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - 
infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f57..1da7798ea26 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PT_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563..6678320f9ff 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; @@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of SplitOp should not be null.")); - PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( - "Outputs(Out) of SplitOp should not be empty.")); - auto in_dims = ctx->GetInputDim("X"); - auto outs_names = ctx->Outputs("Out"); - size_t axis = static_cast(ctx->Attrs().Get("axis")); - size_t num = static_cast(ctx->Attrs().Get("num")); - std::vector sections = static_cast>( - ctx->Attrs().Get>("sections")); - const size_t outs_number = outs_names.size(); - - if (sections.size() > 0) { - PADDLE_ENFORCE_EQ( - sections.size(), outs_number, - platform::errors::InvalidArgument("tensor split sections size " - "should be equal to output size.")); - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); - std::vector outs_dims(outs_number, out_dims); - ctx->SetOutputsDim("Out", outs_dims); - for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - return; - } - - bool each_section_is_known = - (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); - - auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, - in_dims, num, sections, axis, outs_number); - ctx->SetOutputsDim("Out", outs_dims); - if (axis != 0) { - // Only pass LoD when not spliting along the first dim. 
- for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -168,6 +125,10 @@ Example: namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, + PT_INFER_META(phi::SplitInferMeta)); + REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker); + ops::SplitGradMaker, + SplitInferShapeFunctor); diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index eb9a2092657..7e7d77d3af7 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -73,7 +73,7 @@ using ValueVariantType = std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, - std::vector, + std::vector, phi::MetaConfig, paddle::experimental::Backend, paddle::experimental::DataLayout, diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index c7400b93fcd..19b113838ea 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -94,12 +94,16 @@ std::vector split_impl(const Tensor& x, std::vector out; auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); } phi::SplitInferMeta( - MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs); + MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index f3dd056911e..671ba2ec7dc 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -75,13 +75,13 @@ paddle::optional InferMetaContext::OptionalInputAt( : paddle::optional{paddle::none}; } -std::vector InferMetaContext::InputsBetween(size_t start, - size_t end) const { - std::vector result; +std::vector InferMetaContext::InputsBetween(size_t start, + size_t end) const { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*inputs_.at(i)); + result.push_back(inputs_.at(i).get()); } return result; @@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } -std::vector InferMetaContext::MutableOutputBetween(size_t start, - size_t end) { - std::vector result; +std::vector InferMetaContext::MutableOutputBetween(size_t start, + size_t end) { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*outputs_.at(i)); + result.emplace_back(outputs_.at(i).get()); } return result; } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 203dbb26984..a5775db7438 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,13 +50,13 @@ class InferMetaContext { const std::pair& OutputRangeAt(size_t idx) const; const MetaConfig& GetMetaConfig() const; - const MetaTensor& InputAt(size_t idx) const; + const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; - 
std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); - std::vector MutableOutputBetween(size_t start, size_t end); + std::vector MutableOutputBetween(size_t start, size_t end); template AttrType AttrAt(size_t idx) { @@ -157,7 +157,7 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper&, Tail...> { + struct InferMetaFnCallHelper&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static_assert(attr_idx == 0, @@ -165,7 +165,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - std::vector arg = + std::vector arg = ctx->InputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, @@ -210,13 +210,12 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper*, Tail...> { + struct InferMetaFnCallHelper, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { const std::pair range = ctx->OutputRangeAt(out_idx); - std::vector tmp = + std::vector arg = ctx->MutableOutputBetween(range.first, range.second); - std::vector* arg = &tmp; InferMetaFnCallHelper< Tail...>::template Call(ctx, pargs..., diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7a0db3d5c17..8857c2cf424 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { @@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector& x, phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); + if (axis_scalar.FromTensor()) { + auto out_dims = + phi::make_ddim(std::vector(x.at(0)->dims().size(), -1)); + out->set_dims(out_dims); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); + return; + } int axis = axis_scalar.to(); // 1. calculate axis - int rank = x.at(0).dims().size(); + int rank = x.at(0)->dims().size(); PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, @@ -111,15 +120,17 @@ void ConcatInferMeta(const std::vector& x, // 2. 
calculate out dims std::vector x_dims; - for (auto& x_t : x) { - x_dims.push_back(x_t.dims()); + x_dims.reserve(x.size()); + for (const auto* x_t : x) { + x_dims.emplace_back(x_t->dims()); } phi::DDim out_dim = phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); out->set_dims(out_dim); - out->set_dtype(x.at(0).dtype()); - out->set_layout(x.at(0).layout()); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); } } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a5fb2a4cbdd..473845c6e40 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -25,7 +25,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config = MetaConfig()); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 49fd0a343a4..4696187bd23 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config) { + if (!config.is_runtime) { + if (axis.FromTensor() || num_or_sections.FromTensor()) { + auto out_dims = phi::make_ddim(std::vector(x.dims().size(), -1)); + for (auto* item : out) { + item->set_dims(out_dims); + item->share_lod(x); + } + return; + } + } + int axis_value = axis.to(); int rank = x.dims().size(); PADDLE_ENFORCE_EQ( @@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x, axis_value = axis_value + rank; } + std::vector out_dims(out.size(), x.dims()); + auto input_axis_dim = x.dims().at(axis_value); auto num_or_sections_data = num_or_sections.GetData(); - // step1: get formated sections - std::vector sections; // num_or_sections is a number if (num_or_sections_data.size() == 1) { - int num = num_or_sections_data.at(0); + if (config.is_runtime || input_axis_dim > 0) { + int num = num_or_sections_data.at(0); + PADDLE_ENFORCE_EQ( + input_axis_dim % num, + 0, + phi::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, + x.dims(), + axis_value)); - PADDLE_ENFORCE_EQ(input_axis_dim % num, - 0, - phi::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). 
" - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - x.dims(), - axis_value)); - - for (int i = 0; i < num; ++i) { - sections.push_back(input_axis_dim / num); + size_t out_axis_dim = input_axis_dim / num; + for (auto& out_dim : out_dims) { + out_dim[axis_value] = out_axis_dim; + } + } else { + for (auto& out_dim : out_dims) { + out_dim[axis_value] = -1; + } } } else { // num_or_sections is a sections @@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x, int unknow_dim_idx = -1; int num_of_unknow = 0; int sum_of_section = 0; + std::vector sections = num_or_sections_data; for (size_t i = 0; i < num_or_sections_data.size(); ++i) { - sections.push_back(num_or_sections_data[i]); - if (num_or_sections_data[i] == unknow_dim_val) { num_of_unknow++; unknow_dim_idx = i; @@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x, x.dims(), axis_value)); } - } - - // setp2: fill out dims - std::vector out_dims(sections.size(), x.dims()); - if (config.is_runtime || input_axis_dim > 0) { - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out_dims.size(); ++i) { out_dims[i][axis_value] = sections[i]; } - } else { - for (size_t i = 0; i < sections.size(); ++i) { - out_dims[i][axis_value] = -1; - } } - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); } else { - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); - (*out)[i].share_lod(x); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); + out.at(i)->share_lod(x); } } } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 4fab1ec68ec..b3929b9d2b4 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x_meta, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config = MetaConfig()); void UnbindInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index fbc4a86f5af..f1366788146 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx, const std::vector& x, const Scalar& axis) { std::vector meta_x; + meta_x.reserve(x.size()); + std::vector meta_x_ptr; for (const auto& t : x) { meta_x.emplace_back(t); + meta_x_ptr.push_back(&meta_x.back()); } auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 18bb8837b10..5c4202837c4 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx, axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); std::vector x_dims; + 
x_dims.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { x_dims.push_back(x[i].dims()); } @@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx, } } else { std::vector inputs; + inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { if (x[j].numel() > 0) { - inputs.push_back(x[j]); + inputs.emplace_back(x[j]); } else { continue; } diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 722681fb7bc..4acf9b02028 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index a698b9e7161..d2473d5b0b1 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 1e730d809bc..840fe4366ce 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -43,18 +43,18 @@ std::vector Split(const Context& dev_ctx, } std::vector out_meta; + std::vector out_meta_ptr; out_meta.reserve(out_number); + out_meta_ptr.reserve(out_number); std::vector result; result.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { - auto dense_out = phi::Empty(dev_ctx); - MetaTensor tmp_meta(&dense_out); - - result.push_back(dense_out); - out_meta.push_back(&result.back()); + result.emplace_back(phi::Empty(dev_ctx)); + out_meta.emplace_back(&result.back()); + out_meta_ptr.push_back(&out_meta.back()); } - SplitInferMeta(x, num_or_sections, axis, &out_meta); + SplitInferMeta(x, num_or_sections, axis, out_meta_ptr); std::vector outs; outs.reserve(out_meta.size()); diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 5fc9dfe3f64..cfd817c24c7 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
param_code = "" for param in infer_meta_params: if param in input_names: - if param in self.optional_vars: + if self.inputs['input_info'][param] == "const Tensor&": + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + elif self.inputs['input_info'][ + param] == "const std::vector&": + meta_tensor_code = meta_tensor_code + f""" +{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} std::vector {param}_metas({param}_meta_vec.size()); +{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ +{code_indent} {param}_metas[i] = &{param}_meta_vec[i]; +{code_indent} }} +""" + + param_code = param_code + param + "_metas, " + elif param in self.optional_vars: meta_tensor_code = meta_tensor_code + f""" {code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); {code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); @@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " else: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + raise ValueError( + f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." + ) elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" -- GitLab From d49115946db8f9b0dc15986ee10b7209a702fa6e Mon Sep 17 00:00:00 2001 From: helen88 Date: Tue, 1 Mar 2022 11:21:22 +0800 Subject: [PATCH 011/272] optimize mergeadd for sparse_adam,*test=kunlun (#39966) * optimize mergeadd for sparse_adam,*test=kunlun * optimize mergeadd for sparse_adam,*test=kunlun * optimize mergeadd for sparse_adam, *test=kunlun --- cmake/external/xpu.cmake | 2 +- .../operators/math/selected_rows_functor.cc | 71 +++++++++---------- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef..45a76fdc1f1 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f3..5ac39953462 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; -- GitLab From 08b43cce6d2d5e2f57a4317461eb26f88af9bd3c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 1 Mar 2022 11:24:52 +0800 Subject: [PATCH 012/272] [Phi] Support kps backend and kernel registry (#39941) * support kps backend and compile * resolve conflict * fix kps backend trans * test in xpu2 device * remove dummy kernel --- cmake/generic.cmake | 1 + cmake/phi.cmake | 60 +++++++++++++++++++++---- paddle/fluid/framework/phi_utils.cc | 4 ++ paddle/phi/backends/gpu/gpu_context.h | 8 ++++ paddle/phi/backends/xpu/xpu_context.h | 8 ++++ paddle/phi/common/backend.h | 8 ++++ paddle/phi/core/compat/convert_utils.cc | 8 ++++ paddle/phi/tests/common/test_backend.cc | 4 ++ 8 files changed, 93 insertions(+), 8 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe..51ed537ce5d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455..f6e15758379 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,6 +131,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) endif() @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library 
TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,11 +192,20 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. if (WITH_GPU) @@ -193,6 +218,11 @@ function(kernel_library TARGET) hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -200,7 +230,7 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. 
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -209,6 +239,10 @@ function(kernel_library TARGET) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -222,6 +256,9 @@ function(kernel_library TARGET) elseif (WITH_ROCM) hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) else() cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) @@ -232,6 +269,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -240,6 +279,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -249,7 +290,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +316,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 355291beb60..1a39a87fb99 100644 --- 
a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 603ce0817c4..b9d843982dc 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -227,4 +227,12 @@ class GPUContext : public DeviceContext { // must use different function name for cudnn kernel using GPUDNNContext = GPUContext; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using KPSContext = GPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 3005d1707e6..b87489c567c 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -66,4 +66,12 @@ class XPUContext : public DeviceContext { std::unique_ptr impl_; }; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if PADDLE_WITH_XPU_KP +using KPSContext = XPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4b7bf65be39..a9e12f5d81e 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -52,6 +52,9 @@ enum class Backend : uint8_t { MKLDNN, GPUDNN, // cuDNN and hipDNN + // paddle kernel primitives backend + KPS, + // end of backend types NUM_BACKENDS, @@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::GPUDNN: os << "GPUDNN"; break; + case Backend::KPS: + os << "KPS"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; + } else if (s == std::string("KPS")) { + return Backend::KPS; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 3b7a733ede9..b85db07bd9d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::XPU: return phi::XPUPlace( set_device_id ? 
phi::backends::xpu::GetXPUCurrentDeviceId() : 0); +#endif + case phi::Backend::KPS: +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::GPUPlace( + set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); +#elif defined(PADDLE_WITH_XPU_KP) + return phi::XPUPlace( + set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif default: { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index fa4ffc84bf5..5d6862c368c 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -44,6 +44,9 @@ TEST(Backend, OStream) { oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); + oss << phi::Backend::KPS; + EXPECT_EQ(oss.str(), "KPS"); + oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; } catch (const std::exception& exception) { @@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); + EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); -- GitLab From b34663876056740261a9f58cf3e5d90e9e49788f Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 1 Mar 2022 11:25:24 +0800 Subject: [PATCH 013/272] [phi] move uniform_random to phi (#39937) * move uniform_random to phi * fit selected_rows * replace mutable_data --- paddle/fluid/framework/operator.cc | 3 + paddle/fluid/operators/uniform_random_op.cc | 4 - paddle/fluid/operators/uniform_random_op.cu | 3 - .../phi/kernels/cpu/uniform_random_kernel.cc | 115 ++++++++ paddle/phi/kernels/funcs/aligned_vector.h | 75 ++++++ .../phi/kernels/funcs/distribution_helper.h | 249 ++++++++++++++++++ paddle/phi/kernels/funcs/index_impl.cu.h | 93 +++++++ .../phi/kernels/gpu/uniform_random_kernel.cu | 163 ++++++++++++ .../selected_rows/uniform_random_kernel.cc | 88 +++++++ paddle/phi/kernels/uniform_random_kernel.h | 66 +++++ paddle/phi/ops/compat/uniform_random_sig.cc | 159 +++++++++++ 11 files changed, 1011 insertions(+), 7 deletions(-) create mode 100644 paddle/phi/kernels/cpu/uniform_random_kernel.cc create mode 100644 paddle/phi/kernels/funcs/aligned_vector.h create mode 100644 paddle/phi/kernels/funcs/distribution_helper.h create mode 100644 paddle/phi/kernels/funcs/index_impl.cu.h create mode 100644 paddle/phi/kernels/gpu/uniform_random_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/uniform_random_kernel.cc create mode 100644 paddle/phi/kernels/uniform_random_kernel.h create mode 100644 paddle/phi/ops/compat/uniform_random_sig.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4..36208c41ed5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2118,6 +2119,7 @@ void OperatorWithKernel::BuildPhiKernelContext( pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == 
std::type_index(typeid(phi::ScalarArray))) { @@ -2226,6 +2228,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 353d653f481..1c22e60fa87 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -281,10 +281,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL( - uniform_random, paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, paddle::operators::CPUUniformRandomKernel, diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index fb38a6aded4..2ceb8a68d86 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, paddle::operators::GPUUniformRandomKernel, paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc new file mode 100644 index 00000000000..8ec1d9683e1 --- /dev/null +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +inline void UniformRealDistribution(T *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(phi::dtype::bfloat16 *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(min, max); + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} + +template +void UniformRandomRawKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { + out->Resize(phi::make_ddim(shape.GetData())); + VLOG(4) << out->dims(); + T *data = dev_ctx.template Alloc(out); + auto size = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + UniformRealDistribution(data, size, min, max, engine); + if (diag_num > 0) { + PADDLE_ENFORCE_GT( + size, + (diag_num - 1) * (diag_step + 1), + phi::errors::InvalidArgument( + "ShapeInvalid: the diagonal's elements is equal (num-1) " + "* (step-1) with num %d, step %d," + "It should be smaller than %d, but received %d", + diag_num, + diag_step, + (diag_num - 1) * (diag_step + 1), + size)); + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i * diag_step + i; + data[pos] = diag_val; + } + } +} + +template +void UniformRandomKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor *out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random, + CPU, + ALL_LAYOUT, + phi::UniformRandomKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h new file mode 100644 index 00000000000..9382b03cf93 --- /dev/null +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +// Aligned vector generates vectorized load/store on CUDA. 
+template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; + + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } +}; + +template +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *vec = *addr_vec; +} + +template +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *addr_vec = vec; +} + +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. +*/ +template +int GetVectorizedSize(const T* pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = std::alignment_of>::value; // NOLINT + constexpr int vec4 = std::alignment_of>::value; // NOLINT + constexpr int vec2 = std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h new file mode 100644 index 00000000000..49e1c82482c --- /dev/null +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -0,0 +1,249 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/generator.h" + +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#endif + +#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. 
+#define UNLIKELY(condition) (condition) +#endif + +namespace phi { +namespace distribution { + +/********************* Transformation Function **********************/ +template +struct exponential_transform { + explicit exponential_transform(T lambda) : lambda_(lambda) {} + + HOSTDEVICE inline T operator()(T val) const { +#if defined(__NVCC__) || defined(__HIPCC__) + if (std::is_same::value) { + return static_cast(-1.0) / lambda_ * log(val); + } else { + return static_cast(-1.0) / lambda_ * __logf(val); + } +#else + return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); +#endif + } + + private: + T lambda_; +}; + +template +struct uniform_transform { + explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} + + HOSTDEVICE inline T operator()(T val) const { + if (UNLIKELY(val == static_cast(1.0))) { + return min_; + } else { + return val * range_ + min_; + } + } + + private: + T range_; + T min_; +}; + +template +struct normal_transform { + explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} + + HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } + + private: + T mean_; + T std_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = phi::kps; + +/*********************** Distribution Function *************************/ +template +struct uniform_distribution; + +template +struct normal_distribution; + +#if defined(__NVCC__) +template <> +struct uniform_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +#else +template <> +struct uniform_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; +#endif + +/******** Launch GPU function of distribution and transformation *********/ +template +__global__ void DistributionKernel(size_t size, + uint64_t seed, + uint64_t offset, + DistOp dist, + TransformOp trans, + T *out_data, + size_t stride) { + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = DistOp::kReturnsCount; +#if defined(__NVCC__) 
+ curandStatePhilox4_32_10_t state; + curand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = curandStatePhilox4_32_10_t; +#else + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = hiprandStatePhilox4_32_10_t; +#endif + size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; + T args[kCount]; + T result[kCount]; + for (size_t i = idx; i < size; i += total_thread * kCount) { + kps::ElementwiseRandom(&args[0], dist, &state); + kps::ElementwiseUnary( + &result[0], &args[0], trans); + kps::WriteData( + out_data + i, &result[0], size - i, 1, stride, 1); + __syncthreads(); + } +} + +template +void distribution_and_transform(const GPUContext &dev_ctx, + DenseTensor *out, + DistOp dist, + TransformOp trans) { + T *out_data = dev_ctx.template Alloc(out); + auto size = out->numel(); + + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = dev_ctx.GetGenerator(); + + size_t block_size = 256; + size_t expect_grid_size = (size + block_size - 1) / block_size; + const auto &prop = backends::gpu::GetDeviceProperties(device_id); + size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * + prop.multiProcessorCount; + size_t grid_size = + expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; + + size_t total_thread = block_size * grid_size; + size_t curand4_loop_times = + (size + 4 * total_thread - 1) / (4 * total_thread); + // 'increment' shoulde be multiple of 4 + uint64_t increment = curand4_loop_times * 4; + + auto seed_offset = gen_cuda->IncrementOffset(increment); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + DistributionKernel< + T, + DistOp, + TransformOp><<>>( + size, seed, offset, dist, trans, out_data, total_thread); +} + +#endif +} // namespace distribution +} // namespace phi diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h new file mode 100644 index 00000000000..ccb70fe25dd --- /dev/null +++ b/paddle/phi/kernels/funcs/index_impl.cu.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +template +__global__ void VectorizedIndexKernel(T *out, + size_t numel, + size_t main_offset, + Functor func) { + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData( + out + data_offset, &result[0], BLOCK_NUM_X * VecSize); + } + size_t num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) { + int numel = out->numel(); + T *out_data = dev_ctx.template Alloc(out); + if (numel <= 0) return; + int vec_size = phi::GetVectorizedSize(out_data); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu new file mode 100644 index 00000000000..7f24a6667e5 --- /dev/null +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" + +#include "gflags/gflags.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator( + T min, T max, int seed, int diag_num, int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, + T max, + int seed, + int diag_num, + int diag_step, + T diag_val, + int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto size = out->numel(); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + auto generator = dev_ctx.GetGenerator(); + if (generator->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename kps::details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_ctx, out, dist, trans); + } else { + auto seed_offset = generator->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = UniformGeneratorOffset(min, + max, + seed_offset.first, + diag_num, + diag_step, + diag_val, + gen_offset); + IndexKernel>(dev_ctx, out, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_ctx, out, func); + } +} + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {} diff --git 
a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc new file mode 100644 index 00000000000..881180b71b1 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out) { + phi::UniformRandomRawKernel(dev_ctx, + shape, + dtype, + min, + max, + seed, + diag_num, + diag_step, + diag_val, + out->mutable_value()); +} + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out) { + phi::UniformRandomKernel( + dev_ctx, shape, dtype, min, max, seed, out->mutable_value()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h new file mode 100644 index 00000000000..5bba1272785 --- /dev/null +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
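As a quick end-to-end check of the uniform_random kernels added above, the public paddle.uniform API can be exercised directly. A minimal dygraph sketch, assuming a Paddle build that contains this patch (the seed value and shape are arbitrary):

import paddle

paddle.seed(2022)  # fix the global generator so the draw is repeatable
x = paddle.uniform([2, 3], dtype='float32', min=-1.0, max=1.0)
print(x)  # a 2x3 tensor with values drawn uniformly from [-1.0, 1.0)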
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out); + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out); + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc b/paddle/phi/ops/compat/uniform_random_sig.cc new file mode 100644 index 00000000000..d06d4026f4f --- /dev/null +++ b/paddle/phi/ops/compat/uniform_random_sig.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniformRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int diag_num = paddle::any_cast(ctx.Attr("diag_num")); + if (ctx.IsDenseTensorOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } else if (ctx.IsSelectedRowsOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw_sr", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random_sr", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_sr", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_sr", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping); -- GitLab From 4fbcf6f4c52adccbc6ea0786b302485f14e5a951 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 1 Mar 2022 11:51:22 +0800 Subject: [PATCH 014/272] [PHI] Remove reseting dtype, layout and allocation by arg_def for outputs in executor (#39781) * remove SetAllocationForOutputTenosr * add place param for copy kernel * recover SetAllocationForOutputTenosr * polish code * fix empty_dev api bug * remove reseting dtype and layout for output in executor * fix merge bug * [Phi] Add ClearHolder when re-alloc on new place in DeviceContext * fix hostAlloc * remove 
setting output allocation * remove full_kernel_impl.h * fix bug of xpu full_like Co-authored-by: Aurelius84 --- paddle/fluid/framework/operator.cc | 6 ------ paddle/fluid/framework/phi_utils.cc | 21 --------------------- paddle/fluid/framework/phi_utils.h | 3 --- paddle/fluid/imperative/prepared_operator.h | 6 ------ paddle/phi/api/lib/utils/tensor_utils.cc | 21 --------------------- paddle/phi/api/lib/utils/tensor_utils.h | 3 --- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/dense_tensor_impl.cc | 5 ----- paddle/phi/kernels/xpu/full_kernel.cc | 3 ++- 9 files changed, 3 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 36208c41ed5..b12ad552aba 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2108,12 +2108,6 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 1a39a87fb99..93bc2c02d57 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -233,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d8277..a1757881692 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. 
template struct ConvertToPhiContext { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a1..3b5762720e7 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 31325e22afa..1c9f7c3a868 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( return result; } -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def) { - VLOG(5) << "ResetTensor by TensorArgDef."; - if (phi::DenseTensor::classof(dst)) { - auto* dense_t = static_cast(dst); - auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else if (phi::SelectedRows::classof(dst)) { - auto* selected_rows = static_cast(dst); - auto* meta = - phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value()); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported tensor type is received when reseting tensor dtype and " - "layout by argument definition.")); - } -} - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 8b30d5421ab..64df59c1a2a 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index f4f57a0acbb..8ffacbb39bb 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 29e7dc01f32..5ee83089589 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { // Note: 
When you reset holder, you need to ensure the offset is correct void DenseTensor::ResetHolder(const std::shared_ptr& holder) { if (holder_) { - // TODO(zyfncg): The change of static_cast<> in check will recover back - // when SetAllocationForOutputTenosr is deleted. - // Now the numel() may return -1, and will cast to a very large number when - // compare with a data with unsigned long type, this will make checking - // failed, so it's a temporary solution to deal with this problem. PADDLE_ENFORCE_LE( numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 574f4e991a2..d43126d56e8 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValueXPU(dev_ctx, out, val.to()); } @@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { + dev_ctx.template Alloc(out); auto value = val.to(); using XPUInTDType = typename XPUTypeTrait::Type; using CommonType = typename std::common_type< -- GitLab From 468a2a17ce13a43452bbaf6888de4e18e15f063f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 1 Mar 2022 13:11:37 +0800 Subject: [PATCH 015/272] [phi] migrate where kernel into phi (#39811) --- paddle/fluid/operators/where_op.cc | 46 ++----- paddle/fluid/operators/where_op.cu | 126 ------------------ paddle/fluid/operators/where_op.h | 73 ---------- paddle/fluid/operators/where_op_npu.cc | 2 +- paddle/fluid/operators/where_op_xpu.cc | 2 +- paddle/phi/infermeta/binary.cc | 3 +- paddle/phi/infermeta/multiary.cc | 25 ++++ paddle/phi/infermeta/multiary.h | 4 + paddle/phi/kernels/cpu/atan2_grad_kernel.cc | 5 +- paddle/phi/kernels/cpu/atan2_kernel.cc | 5 +- paddle/phi/kernels/cpu/where_grad_kernel.cc | 54 ++++++++ paddle/phi/kernels/cpu/where_kernel.cc | 40 ++++++ paddle/phi/kernels/gpu/atan2_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/atan2_kernel.cu | 5 +- paddle/phi/kernels/gpu/where_grad_kernel.cu | 64 +++++++++ paddle/phi/kernels/gpu/where_kernel.cu | 48 +++++++ .../phi/kernels/impl/atan2_grad_kernel_impl.h | 5 +- paddle/phi/kernels/impl/atan2_kernel_impl.h | 5 +- paddle/phi/kernels/where_grad_kernel.h | 33 +++++ paddle/phi/kernels/where_kernel.h | 31 +++++ paddle/phi/ops/compat/where_grad_sig.cc | 28 ++++ 21 files changed, 352 insertions(+), 257 deletions(-) delete mode 100644 paddle/fluid/operators/where_op.cu delete mode 100644 paddle/fluid/operators/where_op.h create mode 100644 paddle/phi/kernels/cpu/where_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/where_kernel.cc create mode 100644 paddle/phi/kernels/gpu/where_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/where_kernel.cu create mode 100644 paddle/phi/kernels/where_grad_kernel.h create mode 100644 paddle/phi/kernels/where_kernel.h create mode 100644 paddle/phi/ops/compat/where_grad_sig.cc diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc index 92ed2bbdc33..0f10efefa13 100644 --- a/paddle/fluid/operators/where_op.cc +++ b/paddle/fluid/operators/where_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where"); - - auto cond_dims = ctx->GetInputDim("Condition"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - cond_dims, x_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(Condition) and Inputs(X) should be same. " - "But received Condition's shape is [%s], X's shape is [%s]", - cond_dims, x_dims)); - PADDLE_ENFORCE_EQ(x_dims, y_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(X) and Inputs(Y) should be same. " - "But received X's shape is [%s], Y's shape is [%s]", - x_dims, y_dims)); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, + PT_INFER_META(phi::WhereInferMeta)); REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker, ops::WhereOpGradMaker, - ops::WhereOpGradMaker); + ops::WhereOpGradMaker, + WhereInferShapeFunctor); REGISTER_OPERATOR(where_grad, ops::WhereGradOp, ops::WhereGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - where, ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel); -REGISTER_OP_CPU_KERNEL( - where_grad, ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu deleted file mode 100644 index 61a1691e4fe..00000000000 --- a/paddle/fluid/operators/where_op.cu +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace platform = paddle::platform; - -namespace paddle { -namespace operators { - -template -struct CondFunctor { - HOSTDEVICE inline CondFunctor() {} - - HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { - return cond ? x : y; - } -}; - -template -__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, - const T* y, T* out) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - out[idx] = cond[idx] ? x[idx] : y[idx]; - } -} - -template -__global__ void WhereGradCUDAKernel(const int N, const T* dout, - const bool* cond, T* dx, T* dy) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - if (dx != nullptr) { - dx[idx] = cond[idx] ? dout[idx] : 0.; - } - if (dy != nullptr) { - dy[idx] = cond[idx] ? 0. : dout[idx]; - } - } -} - -template -class WhereKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - auto numel = condition->numel(); - - // TODO(GaaoWei8): Input of where can be broadcast - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto functor = CondFunctor(); - std::vector ins = {condition, X, Y}; - std::vector outs = {out}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class WhereGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const bool* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - auto* dout = dout_t->data(); - T* dx = - (dx_t != nullptr) ? dx_t->mutable_data(context.GetPlace()) : nullptr; - T* dy = - (dy_t != nullptr) ? dy_t->mutable_data(context.GetPlace()) : nullptr; - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel()); - WhereGradCUDAKernel< - T><<>>( - numel, dout, cond_data, dx, dy); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - where, paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel); -REGISTER_OP_CUDA_KERNEL( - where_grad, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h deleted file mode 100644 index 5398ee024a2..00000000000 --- a/paddle/fluid/operators/where_op.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class WhereKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto x_numel = X->numel(); - for (int i = 0; i < x_numel; i++) { - out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; - } - } -}; - -template -class WhereGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const auto* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - - auto* dout = dout_t->data(); - if (dx_t != nullptr) { - auto* dx = dx_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); - } - } - if (dy_t != nullptr) { - auto* dy = dy_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc index d4294393daa..35508950941 100755 --- a/paddle/fluid/operators/where_op_npu.cc +++ b/paddle/fluid/operators/where_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op_xpu.cc b/paddle/fluid/operators/where_op_xpu.cc index 3a4875c0700..41232c8b5e8 100644 --- a/paddle/fluid/operators/where_op_xpu.cc +++ b/paddle/fluid/operators/where_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 1905e33bd03..675e68af743 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -306,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x, } void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - auto in_dims = x.dims(); - out->set_dims(in_dims); + out->share_meta(x); } void BCELossInferMeta(const MetaTensor& input, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 8857c2cf424..7634e5e01ac 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -133,4 +133,29 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto cond_dims = condition.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + cond_dims, + x_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(Condition) and Inputs(X) should be same. " + "But received Condition's shape is [%s], X's shape is [%s]", + cond_dims, + x_dims)); + PADDLE_ENFORCE_EQ(x_dims, + y_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(X) and Inputs(Y) should be same. " + "But received X's shape is [%s], Y's shape is [%s]", + x_dims, + y_dims)); + out->share_meta(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 473845c6e40..2afb79daa35 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -30,4 +30,8 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 6ff7431f0c8..7a519aab0ad 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index eb38a6c90b7..df6f5f59ac0 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/where_grad_kernel.cc b/paddle/phi/kernels/cpu/where_grad_kernel.cc new file mode 100644 index 00000000000..67c8cee1038 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const auto* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + if (x_grad != nullptr) { + auto* dx = ctx.template Alloc(x_grad); + for (int i = 0; i < numel; i++) { + dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); + } + } + if (y_grad != nullptr) { + auto* dy = ctx.template Alloc(y_grad); + for (int i = 0; i < numel; i++) { + dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + CPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/where_kernel.cc b/paddle/phi/kernels/cpu/where_kernel.cc new file mode 100644 index 00000000000..f624c13c262 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + const bool* cond_data = condition.data(); + const T* x_data = x.data(); + const T* y_data = y.data(); + auto x_numel = x.numel(); + + T* out_data = ctx.template Alloc(out); + + for (int i = 0; i < x_numel; i++) { + out_data[i] = cond_data[i] ? 
x_data[i] : y_data[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu index 1cc3311c363..6652d242de5 100644 --- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, GPU, diff --git a/paddle/phi/kernels/gpu/atan2_kernel.cu b/paddle/phi/kernels/gpu/atan2_kernel.cu index 702c959b78f..dd0bba177de 100644 --- a/paddle/phi/kernels/gpu/atan2_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, GPU, diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu new file mode 100644 index 00000000000..f21aca80e21 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +__global__ void WhereGradCUDAKernel( + const int N, const T* dout, const bool* cond, T* dx, T* dy) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + if (dx != nullptr) { + dx[idx] = cond[idx] ? dout[idx] : 0.; + } + if (dy != nullptr) { + dy[idx] = cond[idx] ? 0. : dout[idx]; + } + } +} + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const bool* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + T* dx = (x_grad != nullptr) ? ctx.template Alloc(x_grad) : nullptr; + T* dy = (y_grad != nullptr) ? 
ctx.template Alloc(y_grad) : nullptr; + + auto stream = ctx.stream(); + auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel); + WhereGradCUDAKernel< + T><<>>( + numel, dout, cond_data, dx, dy); +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + GPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu new file mode 100644 index 00000000000..03c24eea3a9 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +// Cond +template +struct CondFunctor { + inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const { + return cond ? x : y; + } +}; + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector ins = {&condition, &x, &y}; + std::vector outs = {out}; + ctx.template Alloc(out); + + CondFunctor func; + funcs::BroadcastKernel( + ctx, ins, &outs, -1, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index d0dd1829851..0eff1378f41 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index 2cae914e2f6..7653032f211 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { template diff --git a/paddle/phi/kernels/where_grad_kernel.h b/paddle/phi/kernels/where_grad_kernel.h new file mode 100644 index 00000000000..1a3c66ee6ed --- /dev/null +++ b/paddle/phi/kernels/where_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/where_kernel.h b/paddle/phi/kernels/where_kernel.h new file mode 100644 index 00000000000..254271ac9c7 --- /dev/null +++ b/paddle/phi/kernels/where_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/where_grad_sig.cc b/paddle/phi/ops/compat/where_grad_sig.cc new file mode 100644 index 00000000000..71984a26d35 --- /dev/null +++ b/paddle/phi/ops/compat/where_grad_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
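The migrated where and where_grad kernels keep the existing user-facing behaviour of paddle.where. A minimal dygraph sketch covering both the forward and the backward path, assuming a Paddle build that contains this patch (tensor values are arbitrary):

import paddle

cond = paddle.to_tensor([True, False, True])
x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = paddle.to_tensor([10.0, 20.0, 30.0], stop_gradient=False)

out = paddle.where(cond, x, y)  # [1.0, 20.0, 3.0]
out.sum().backward()
print(x.grad)  # gradient reaches x only where cond is True
print(y.grad)  # gradient reaches y only where cond is False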
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("where_grad", + {"Condition", "X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); -- GitLab From a7acfc5b357b8d7de29bd3cf240309c2deb72a2e Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 1 Mar 2022 13:16:45 +0800 Subject: [PATCH 016/272] update error_string when target is out of bound (#40001) --- python/paddle/nn/functional/loss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e59ef5ebfb0..e6efde83628 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1667,11 +1667,11 @@ def cross_entropy(input, label_min = paddle.min(valid_label) label_max = paddle.max(valid_label) if label_min < 0: - raise ValueError("label should not out of bound, but got{}". - format(label_min)) + raise ValueError("Target {} is out of lower bound.".format( + label_min.item())) if label_max >= input.shape[axis]: - raise ValueError("label should not out of bound, but got{}". - format(label_max)) + raise ValueError("Target {} is out of upper bound.".format( + label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', -- GitLab From 4204b97ab350298812dd56fb4a5eac504b848aae Mon Sep 17 00:00:00 2001 From: pangyoki Date: Tue, 1 Mar 2022 13:53:39 +0800 Subject: [PATCH 017/272] change tests_v2 to dynamic_tests_v2 in CI op benchmark (#39995) --- tools/ci_op_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 80efd32ecf1..1db79418b2d 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -106,7 +106,7 @@ function prepare_benchmark_environment { [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 LOG "[INFO] Collect api info ..." python benchmark/api/deploy/collect_api_info.py \ - --test_module_name tests_v2 \ + --test_module_name dynamic_tests_v2 \ --info_file api_info.txt >& 2 [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 @@ -185,7 +185,7 @@ function run_op_benchmark_test { logs_dir="$(pwd)/logs-${branch_name}" [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ $logs_dir \ $VISIBLE_DEVICES \ @@ -212,7 +212,7 @@ function check_op_benchmark_result { # there is no need to recompile and install paddle LOG "[INFO] retry ${retry_time} times ..." 
pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ ${logs_dir} \ $VISIBLE_DEVICES \ -- GitLab From 9de798928509d5bc7e213c385ef565fc7ecfa3dc Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 1 Mar 2022 14:45:38 +0800 Subject: [PATCH 018/272] add MasterParam and MasterParamOut for sparse_momentum op (#39969) --- paddle/fluid/pybind/op_function_generator.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b07a439d33..d23b3dd64ab 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -60,7 +60,8 @@ std::map> op_ins_map = { {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, {"merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"sparse_momentum", + {"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, {"fused_feedforward", @@ -124,7 +125,7 @@ std::map> op_outs_map = { {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"run_program", {"DOut"}}, {"adam", @@ -181,7 +182,7 @@ std::map> op_passing_outs_map = { "out_old_num_accumulates", "out_num_updates"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, -- GitLab From 6d26b332d9fee77f16a8655c8ead3f21f2805975 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:52:54 +0800 Subject: [PATCH 019/272] [bf16] add bf16 kernel: scale gather sum (#39683) * add scale gather sum * refine CUDA_ATOMIC_WRAPPER ADD for bf16 * add gather unittest * solve conflict * add scale uinttest * add sum unittest * solve conflict * refine gather unittest * refine unittest --- paddle/fluid/operators/gather_op.cc | 6 +- paddle/fluid/operators/gather_op.cu | 6 +- .../operators/math/selected_rows_functor.cu | 2 + paddle/fluid/operators/sum_op.cu | 3 +- .../platform/device/gpu/gpu_primitives.h | 67 +++++++++++++++++++ paddle/phi/kernels/gpu/scale_kernel.cu | 1 + .../paddle/fluid/tests/unittests/op_test.py | 7 +- .../fluid/tests/unittests/test_gather_op.py | 35 +++++++++- .../fluid/tests/unittests/test_scale_op.py | 19 +++++- .../fluid/tests/unittests/test_sum_op.py | 26 +++++++ 10 files changed, 164 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670..8a405cc6fc1 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, 
ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e..a502a130409 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b1..a4678550cf7 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b..33590c1d7cc 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8aec8e840f3..803674779e7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( #endif #endif +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. 
+inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. 
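The two helpers above emulate a bfloat16 atomic add by updating one half of an aligned 32-bit word through atomicCAS. As a host-side illustration of the same bit packing (plain Python/numpy, truncating where the device code rounds; not part of the patch):

import numpy as np

def bf16_bits_to_float(bits):
    # A bfloat16 is the upper 16 bits of an IEEE-754 float32 pattern.
    return float(np.array([bits << 16], dtype=np.uint32).view(np.float32)[0])

def float_to_bf16_bits(value):
    # Truncation for brevity; the real conversion rounds to nearest-even.
    return int(np.array([value], dtype=np.float32).view(np.uint32)[0] >> 16)

def add_to_low_half(val, x):
    # Only the bfloat16 packed in the lower 16 bits of `val` is updated.
    low = bf16_bits_to_float(val & 0xFFFF)
    return (val & 0xFFFF0000) | float_to_bf16_bits(low + x)

def add_to_high_half(val, x):
    # Only the bfloat16 packed in the upper 16 bits of `val` is updated.
    high = bf16_bits_to_float(val >> 16)
    return (val & 0xFFFF) | (float_to_bf16_bits(high + x) << 16)

# 0x3F80 is bfloat16 1.0; adding 1.0 to the low half yields 0x4000 (2.0).
assert add_to_low_half(0xAAAA3F80, 1.0) == 0xAAAA4000
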
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); float *imag = real + 1; diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index d9c8de21c5b..930c50a24be 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale, float, double, phi::dtype::float16, + phi::dtype::bfloat16, uint8_t, int8_t, int16_t, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 848ebae0706..5694ef25c79 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -482,7 +482,12 @@ class OpTest(unittest.TestCase): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) "infer datatype from inputs and outputs for this test case" - self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + if self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) inputs = append_input_output(block, op_proto, self.inputs, True, self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 83b39a62f15..978a3d86d88 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid from paddle.framework import core @@ -117,6 +117,39 @@ class TestCase6(TestGatherOp): self.index_type = "int32" +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + 'X': convert_float_to_uint16(xnp), + 'Index': index_np, + 'Axis': axis_np + } + out = gather_numpy(self.inputs['X'], index_np, axis_np[0]) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + class TestGatherOp1(OpTest): def setUp(self): self.op_type = "gather" diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index c1ce032f506..d432b8057f6 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -153,6 +153,23 @@ class 
TestScaleFp16Op(TestScaleOp): place, ["X"], "Out", max_relative_error=0.05) +class TestScaleBF16Op(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.uint16 + self.attrs = {'scale': -2.3} + x = np.random.random((10, 10)).astype(np.float32) + out = x * np.float32(self.attrs['scale']) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.8) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index eddccd4ff24..7040145a768 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent): globals()[cls_name] = TestSumFp16Case +#----------- test bf16 ----------- +class TestSumBF16Op(OpTest): + def setUp(self): + self.op_type = "sum" + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(np.float32) + x1 = np.random.random((3, 40)).astype(np.float32) + x2 = np.random.random((3, 40)).astype(np.float32) + y = x0 + x1 + x2 + self.inputs = { + "X": [("x0", convert_float_to_uint16(x0)), + ("x1", convert_float_to_uint16(x1)), + ("x2", convert_float_to_uint16(x2))] + } + self.outputs = {'Out': convert_float_to_uint16(y)} + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + + class API_Test_Add_n(unittest.TestCase): def test_api(self): with fluid.program_guard(fluid.Program(), fluid.Program()): -- GitLab From 25650774d9623a3975567fa9f9b9a35b928ffce2 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Tue, 1 Mar 2022 14:57:54 +0800 Subject: [PATCH 020/272] add test_warpctc_op in mac (#39983) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2361bd27062..7d64cf7bd89 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE) py_test_modules(test_warpctc_op MODULES test_warpctc_op) set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) endif() -- GitLab From fc06be9dbd82da832c8eed8cac8573d0166638ba Mon Sep 17 00:00:00 2001 From: wenbin Date: Tue, 1 Mar 2022 15:08:27 +0800 Subject: [PATCH 021/272] remove conv_affine_channel_fuse_pass (#39817) * remove * pass * more pass --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - .../ir/conv_affine_channel_fuse_pass.cc | 420 ------------------ .../ir/conv_affine_channel_fuse_pass.h | 54 --- .../inference/api/paddle_pass_builder.cc | 56 ++- .../quantization/quant2_int8_mkldnn_pass.py | 3 - .../test_conv_affine_channel_fuse_pass.py | 160 ------- ...onv_eltwiseadd_affine_channel_fuse_pass.py | 183 -------- 
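The bfloat16 unit tests added above (TestGatherBF16Op, TestScaleBF16Op, TestSumBF16Op) all feed inputs through convert_float_to_uint16, storing bfloat16 data in uint16 arrays. A rough numpy equivalent of that round trip, shown here with truncation while the op_test helper rounds, purely for orientation:

import numpy as np

def to_bf16_uint16(x):
    # Keep the upper 16 bits of each float32 bit pattern.
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

def from_bf16_uint16(x):
    # Re-expand to float32 by restoring the dropped lower 16 bits as zeros.
    return (np.asarray(x, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)

x = np.random.random((3, 40)).astype(np.float32)
x_bf16 = to_bf16_uint16(x)
# bfloat16 keeps ~8 mantissa bits, so values in [0, 1) survive to within ~1e-2.
assert np.allclose(from_bf16_uint16(x_bf16), x, atol=1e-2)
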
tools/parallel_UT_rule.py | 2 - tools/static_mode_white_list.py | 1 - 9 files changed, 25 insertions(+), 855 deletions(-) delete mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h delete mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py delete mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590c..0d53a54ff82 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd8..00000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - 
.End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
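The folding performed by recompute_bias_and_weights above rests on a simple identity: an affine_channel applied after a convolution can be absorbed by rescaling the conv weights per output channel and adding the channel bias. A tiny numpy check of that identity for a 1x1-style case (an illustrative sketch with made-up shapes, not the pass itself; the conv-only variant starts from a zero conv bias):

import numpy as np

np.random.seed(0)
x = np.random.rand(8, 4)        # 8 spatial positions, 4 input channels
w = np.random.rand(4, 6)        # behaves like a 1x1 conv: 4 in, 6 out channels
scale = np.random.rand(6)       # affine_channel per-output-channel scale
bias = np.random.rand(6)        # affine_channel per-output-channel bias

y_ref = x.dot(w) * scale + bias       # conv2d -> affine_channel
y_fused = x.dot(w * scale) + bias     # conv2d with rescaled weights -> elementwise_add
assert np.allclose(y_ref, y_fused)
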
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89..00000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. 
- */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea..f5f36d805b4 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - 
"conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index d5bc2e6b530..9d9fbd39a57 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object): graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') - graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass') - graph = self._apply_pass(graph, - 'conv_eltwiseadd_affine_channel_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py deleted file mode 100644 index 5afaf08eec3..00000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume, reproduce_failure -import hypothesis.strategies as st - - -class TestConvAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["conv_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, ac_op] - - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - outputs=["affine_channel_ouput"]) - if has_bias == True: - program_config.weights["conv2d_bias"] = TensorConfig( - data_gen=partial(generate_bias)) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 
'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py deleted file mode 100644 index a8bfdb79ca1..00000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume -import hypothesis.strategies as st - - -class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - - if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3: - return False - - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - eltwise_op = OpConfig( - "elementwise_add", - inputs={"X": ["conv_output"], - "Y": ["conv2d_bias"]}, - outputs={"Out": ["elementwise_output"]}, - axis=axis) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["elementwise_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, eltwise_op, ac_op] - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - 
outputs=["affine_channel_ouput"]) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - workspace_size=1 << 20, - max_batch_size=4, - min_subgraph_size=1, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - it will trigger Broadcast dimension mismatch bug \ - when data_format attribute is NHWC and axis of eltwise op is 1 for this pass." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_eltwiseadd_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 4df27bfe4e9..7f8e516496f 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', 'test_inplace_softmax_with_cross_entropy', 'test_transforms', 'test_unfold_op', 'test_assign_op', 'test_isinstance', - 'test_conv_affine_channel_fuse_pass', 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', @@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [ 'test_dataloader_unkeep_order', 'test_parallel_executor_profiler', 'test_correlation', - 'test_conv_affine_channel_fuse_pass', 'test_ir_inplace_pass', 'test_moving_average_abs_max_scale_op', 'test_flatten_contiguous_range_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 694283264ca..7356f0c8db0 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_ir_fc_fuse_pass', 'test_ir_skip_layernorm_pass', - 'test_conv_affine_channel_fuse_pass', 'test_conv_bias_mkldnn_fuse_pass', 'test_conv_bn_fuse_pass', 'test_conv_elementwise_add2_act_fuse_pass', -- GitLab From ce8ed978cbfce2e0fa503690d31d2e3244066b31 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 1 Mar 2022 16:11:28 +0800 Subject: [PATCH 022/272] [bf16] add bf16 kernel: layer_norm p_norm reduce_sum (#39843) * add layer norm * add p norm * add reduce sum * refine layer norm register bf16 for cudnn811 * add bf16 cast for hip * add unittest * refine rocm * refine 
layer_norm unittest * refine reduce op * refine unittest * enhance atol for reduce unittest --- paddle/fluid/operators/cast_op.cu | 4 - paddle/fluid/operators/layer_norm_kernel.cu.h | 6 +- paddle/fluid/operators/layer_norm_op.cu | 15 ++++ paddle/fluid/operators/p_norm_op.cu | 12 +++ .../reduce_ops/reduce_sum_op.part.cu | 1 + paddle/phi/kernels/gpu/cast_kernel.cu | 4 - paddle/phi/kernels/gpu/math_kernel.cu | 1 + paddle/phi/kernels/math_kernel.cc | 1 + .../paddle/fluid/tests/unittests/op_test.py | 2 +- .../tests/unittests/test_layer_norm_op.py | 47 ++++++++++++ .../fluid/tests/unittests/test_norm_all.py | 76 ++++++++++++++++++- .../fluid/tests/unittests/test_reduce_op.py | 33 +++++++- 12 files changed, 188 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561..eb51215790b 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0..62c21dd2eee 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d9..dfe73d37271 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5..d0b78b9b064 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ 
platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ec..2f6bf127518 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel>, CUDAReduceSumGradKernel>); diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 7a6c99c5fe1..569a46f56d5 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx, paddle::experimental::DataType::UNDEFINED); \ } -#if !defined(PADDLE_WITH_HIP) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16) -#else -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) -#endif diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 56e8b16ccbe..fc73ccca6de 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw, float, double, float16, + bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 3cb7b66ddf7..480eb56c8b0 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int16_t, int, int64_t, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 5694ef25c79..628791afef5 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1140,7 +1140,7 @@ class OpTest(unittest.TestCase): else: atol = 2 else: - atol = 1e-2 + atol = 1e-1 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 7dd310d2b88..ca9a489c749 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase): assert_equal(b_g_np_1, b_g_np_2) +class TestBF16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + + if dtype 
== "bfloat16": + x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16) + + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + + y_np = y.cast('float32').numpy() + x_g_np = x_g.cast('float32').numpy() + w_g_np = w_g.cast('float32').numpy() + b_g_np = b_g.cast('float32').numpy() + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100): + return + x_np = np.random.random([10, 20]).astype('float32') + weight_np = np.random.random([20]).astype('float32') + bias_np = np.random.random([20]).astype('float32') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float32') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'bfloat16') + + def assert_equal(x, y): + self.assertTrue(np.allclose(x, y, atol=1.e-1)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): def test_main(self): self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index b20305b78ef..575bc653618 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16): self.asvector = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormBF16Op(OpTest): + def setUp(self): + self.op_type = "p_norm" + self.init_test_case() + self.x = (np.random.random(self.shape) + 0.5).astype(np.float32) + self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim, + self.asvector) + self.gradient = self.calc_gradient() + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + self.outputs = {'Out': convert_float_to_uint16(self.norm)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = np.uint16 + self.asvector = False + + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + x = self.x + porder = self.attrs["porder"] + axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm( + x, axis=axis, porder=porder, 
keepdims=True, reduce_all=asvector) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / numel] + + def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index faa67e1d6da..d246356b4ec 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSumOp_bf16(OpTest): + def setUp(self): + np.random.seed(100) + self.op_type = "reduce_sum" + self.dtype = np.uint16 + self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32) + self.attrs = {'dim': [0, 1, 2]} + self.out = self.x.sum(axis=tuple(self.attrs['dim'])) + self.gradient = self.calc_gradient() + + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.gradient = self.calc_gradient() + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def calc_gradient(self): + x = self.x + grad = np.ones(x.shape, dtype=x.dtype) + return [grad] + + class TestSumOp_fp16_withInt(OpTest): def setUp(self): self.op_type = "reduce_sum" -- GitLab From eb7c211a762c0961915c0f9a5d7b0010cd2746e2 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Tue, 1 Mar 2022 11:33:10 +0100 Subject: [PATCH 023/272] Add mobilenetv3_large performance test for bf16 and int8 (#39738) * Add mobilenetv3_large performance test * Disable the BF16 test if the device does not support BF16 computations * Change test timeout --- .../fluid/inference/tests/api/CMakeLists.txt | 29 ++++++++++++++++++ ...er_bfloat16_image_classification_tester.cc | 15 ++++++++-- ...alyzer_int8_image_classification_tester.cc | 7 ++++- .../fluid/inference/tests/api/tester_helper.h | 30 +++++++++++-------- .../fluid/contrib/slim/tests/CMakeLists.txt | 11 +++++-- 5 files changed, 75 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 37214534f3c..0281fd91765 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -453,6 +453,23 @@ if(WITH_MKLDNN) download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} 
${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + # mobilenetv3_large_x1_0 int8 + set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") + set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") + if (NOT EXISTS ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) + inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME}) + endif() + inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large + COMMAND ${INT8_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --warmup_batch_size=50 + --batch_size=1 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### BFLOAT16 tests # build test binary to be used in subsequent tests @@ -472,6 +489,17 @@ if(WITH_MKLDNN) # mobilenetv2 bfloat16 inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # mobilenetv3_large + inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large + COMMAND ${BF16_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --batch_size=1 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") @@ -739,6 +767,7 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc index 3b16b0d34fd..f267f0f28d6 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -14,13 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); @@ -38,7 +44,12 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) { // read data from file and prepare batches with test data std::vector> input_slots_all; SetInputs(&input_slots_all); - b_cfg.EnableMkldnnBfloat16(); + if (FLAGS_enable_bf16 && + platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) { + b_cfg.EnableMkldnnBfloat16(); + } else { + FLAGS_enable_bf16 = false; + } CompareBFloat16AndAnalysis(&cfg, &b_cfg, input_slots_all); } diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 8f8b7304423..b07163b518b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -22,7 +22,12 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 637fa16e31b..e63dfd14175 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -213,15 +213,15 @@ std::shared_ptr> GetWarmupData( element_in_batch * 3 * 224 * 224, 3 * 224 * 224, static_cast(images.data.data()) + i * 3 * 224 * 224); - - std::copy_n(static_cast(test_data[batch][1].data.data()) + - element_in_batch, - 1, static_cast(labels.data.data()) + i); + if (FLAGS_with_accuracy_layer) + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); } - - auto warmup_data = std::make_shared>(2); + auto warmup_data = std::make_shared>( + FLAGS_with_accuracy_layer ? 
2 : 1); (*warmup_data)[0] = std::move(images); - (*warmup_data)[1] = std::move(labels); + if (FLAGS_with_accuracy_layer) (*warmup_data)[1] = std::move(labels); return warmup_data; } @@ -254,9 +254,13 @@ void SetInputs(std::vector> *inputs, } for (auto i = 0; i < iterations; i++) { auto images = image_reader.NextBatch(); - auto labels = label_reader.NextBatch(); - inputs->emplace_back( - std::vector{std::move(images), std::move(labels)}); + std::vector tmp_vec; + tmp_vec.push_back(std::move(images)); + if (FLAGS_with_accuracy_layer) { + auto labels = label_reader.NextBatch(); + tmp_vec.push_back(std::move(labels)); + } + inputs->push_back(std::move(tmp_vec)); } } @@ -825,7 +829,8 @@ void CompareQuantizedAndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "INT8", sample_latency_int8); - CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } void CompareBFloat16AndAnalysis( @@ -864,7 +869,8 @@ void CompareBFloat16AndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "BF16", sample_latency_bf16); - CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); } void CompareAnalysisAndAnalysis( diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 494ea969797..f75a0fa50a5 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -25,6 +25,12 @@ function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_pa _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True) endfunction() +function(download_data install_dir url data_file check_sum) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${url} ${data_file} ${check_sum}) + endif() +endfunction() + function(download_quant_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) @@ -290,8 +296,9 @@ if(LINUX AND WITH_MKLDNN) ### PTQ INT8 # PTQ int8 lstm model - set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz") - download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7) + set(LSTM_DATA_FILE "quant_lstm_input_data.tar.gz") + set(LSTM_URL "${INFERENCE_URL}/int8/unittest_model_data") + download_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_URL} ${LSTM_DATA_FILE} add84c754e9b792fea1fbd728d134ab7) set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz") download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743) inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data) -- GitLab From 2592805ba0bc121bef82331214cd5d233c08d636 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 1 Mar 2022 18:46:16 +0800 Subject: [PATCH 024/272] Fixed auto codegen for intermediate tensors (#39797) * Refactored GradNodeAccumulation data structure and behaviour * Fixed CI issues * Fix compilation issues * Fixed minor issues * Reverted changes for intermediate and OverwriteOutput * fixed minor issue * Fixed 
auto codegen for intermediate tensors * Removed restriction on AccumulationNode modification * Fixed CI Coverage issues * Adjusted Log contents * Fixed CI issues --- paddle/fluid/eager/api/utils/hook_utils.cc | 63 +++++++++---------- .../auto_code_generator/eager_generator.cc | 25 +++++--- paddle/fluid/eager/backward.cc | 7 ++- paddle/fluid/eager/grad_node_info.cc | 2 +- paddle/fluid/eager/utils.cc | 9 +++ 5 files changed, 58 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index c7927716300..9abd7be49d4 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,49 +52,44 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -static void RetainGradForRegularNode( - const paddle::experimental::Tensor& tensor) { - AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); - if (meta->RetainGrads()) { +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details return; } else { - meta->SetRetainGrads(true); - } + AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } - std::weak_ptr weak_grad_tensor = - meta->WeakGrad(); + std::weak_ptr weak_grad_tensor = + meta->WeakGrad(); - // Define Hook - auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); + // Define Hook + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; return paddle::experimental::Tensor(); } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + }; - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, - std::make_shared(hook)); -} - -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - if (IsLeafTensor(tensor)) { - // Leaf tensor's grad will always be retained - // Refer to implementation of AccumulationNode for more details - return; - } else { - RetainGradForRegularNode(tensor); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index a8e0ed7a41a..102fad56373 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1156,11 +1156,13 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, 
output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( @@ -1173,17 +1175,20 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7073ca8f052..356fdcaf054 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -221,10 +221,11 @@ void RunBackward(const std::vector& tensors, << " 's name is: " << grad_output_tensor.name(); auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - node_input_buffers_dict[next_node] = - std::make_unique(next_node->InputMeta()); + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 35416281f18..b1189106b8f 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -244,7 +244,7 @@ GradNodeBase::ApplyGradientHooks( if (!out.defined() || !out.initialized()) { out = (*hook)(tensors[slot_id][rank]); } else { - // If more than one hook is registered, the input to the next hook func + // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook out = (*hook)(out); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index a7e5931f1f9..39861c80522 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,12 +122,21 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { + if (dynamic_cast(autograd_meta->GradNode())) { + VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is " + "detected"; + } 
autograd_meta->SetGradNode(grad_node); } } void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { + if (dynamic_cast(autograd_meta->GradNode())) { + VLOG(6) + << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected"; + } + autograd_meta->SetGradNode(grad_node); } -- GitLab From 255bf609e5d9289dfc6d5122e7fda746c933b6e2 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 1 Mar 2022 18:48:02 +0800 Subject: [PATCH 025/272] Add function description for Kernel Primitive API (#39884) * Add function description for Kernel Primitive API 1. Set cumsum and sort share memory size = 1024 2.sort and cumsum api limitation : blockDim.x must be less than 512 (blockDim.x <= 512) --- .../kernels/primitive/compute_primitives.h | 284 +++++++++++++----- .../primitive/compute_primitives_xpu2.h | 23 ++ .../kernels/primitive/datamover_primitives.h | 32 ++ .../primitive/datamover_primitives_xpu2.h | 41 +++ 4 files changed, 311 insertions(+), 69 deletions(-) diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 4f3c069f3b2..19427551fb3 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -136,7 +136,9 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { return shared_memory[threadIdx.x]; } -// Swap data +/** + * @brief Swap data + */ template __device__ __forceinline__ void Swap(T* first_value, T* second_value) { T t_value; @@ -145,7 +147,9 @@ __device__ __forceinline__ void Swap(T* first_value, T* second_value) { (*second_value) = t_value; } -// swap with monotonic_type +/** + * @brief Swap data according to monotonic_type. + */ template __device__ __forceinline__ void Comparator(T* first_value, T* second_value, @@ -155,6 +159,9 @@ __device__ __forceinline__ void Comparator(T* first_value, } } +/** + * @brief Swap data and data index according to monotonic_type. + */ template __device__ __forceinline__ void ComparatorWithIndex(T* first_value, @@ -170,6 +177,18 @@ __device__ __forceinline__ void ComparatorWithIndex(T* first_value, } } +/** + * @brief get the last pow of 2 + */ +__device__ inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + } // namespace details /** @@ -453,6 +472,29 @@ __device__ __forceinline__ void Reduce(T* out, } } +/* +* @brief Fill register with a constant according to OpFunc +* +* @template paraments +* InT: The data type of in1 and in2. +* OutT: The data type of out. +* NX: The number of data columns loaded by each thread. +* NY: The number of data rows loaded by each thread. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()() +* const { +* return a; +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). +*/ template +* struct XxxFunctor { +* HOSTDEVICE InT operator()(StateType state) +* const { +* return ranomd(state); // Returns ReturnsCount random numbers with +* data type T +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). 
+*/ + template +/* +* @brief Complete the prefix and in the block, each thread calculates 2 data, +* the size of out and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: the type of input register. +* OutT: the type of out register. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()(T a, T b) +* const { +* return a + b; +* } +* }; +* +* @param +* out: The register pointer of out, the size is 2; +* in: The register pointer of input, the size is 2; +* compute: Compute function which was declared like OpFunc(). +*/ + +#define SHARED_SIZE_LIMIT 512 +template __device__ __forceinline__ void Cumsum(OutT* out, const InT* in, OpFunc compute) { - __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32]; + constexpr int kSize = SHARED_SIZE_LIMIT * 2 + (SHARED_SIZE_LIMIT * 2) / 32; + __shared__ InT temp[kSize]; + int stride_size = blockDim.x; int tidx = threadIdx.x; temp[tidx + tidx / 32] = in[0]; - temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1]; - for (int stride = 1; stride <= blockDim.x; stride *= 2) { + temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1]; + for (int stride = 1; stride <= stride_size; stride *= 2) { __syncthreads(); int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { - temp[index + index / 32] += temp[index - stride + (index - stride) / 32]; + temp[index + index / 32] = + compute(temp[index + index / 2], + temp[index - stride + (index - stride) / 32]); } } for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) { __syncthreads(); int index = (tidx + 1) * 2 * stride - 1; if ((index + stride) < (blockDim.x * 2)) { - temp[index + stride + (stride + index) / 32] += - temp[index + (index) / 32]; + temp[index + stride + (stride + index) / 32] = + compute(temp[index + stride + (stride + index) / 32], + temp[index + (index) / 32]); } } __syncthreads(); out[0] = static_cast(temp[tidx + tidx / 32]); out[1] = - static_cast(temp[tidx + shared_size + (tidx + shared_size) / 32]); + static_cast(temp[tidx + stride_size + (tidx + stride_size) / 32]); } - -#define SHARED_SIZE_LIMIT \ - 1024 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must - // larger than blockDim.x * 2 -// if monotonic_type = 1 then increase -// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2 -// == 1 the increase -template -__device__ __forceinline__ void Sort(T* dst, - const T* src_data, +#undef SHARED_SIZE_LIMIT + +/* +* @brief Sort data in this block, each thread calculates 2 data, the size of out +* and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: the type of input register. +* OutT: the type of out register. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* +* @param +* out: The register pointer of out, the size is 2. +* in: The register pointer of input, the size is 2. +* num: The num of this block +* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles +* sorted in escending. 
+*/ +#define SHARED_SIZE_LIMIT 1024 +// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must +// larger than blockDim.x * 2 +template +__device__ __forceinline__ void Sort(OutT* out, + const InT* in, int num, int monotonic_type) { - // todo: set num = Pow2(num) + int upper_bound = blockDim.x; + // update upper_bound + upper_bound = std::min(details::GetLastPow2(num), upper_bound); // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 - __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than - // blockDim * 2 - // Copy value and index from src and src_index - value[threadIdx.x] = src_data[0]; - value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + __shared__ InT value[SHARED_SIZE_LIMIT]; + int stride_size = blockDim.x; + // shareMem's size must larger than blockDim * 2 + // Copy value from in + value[threadIdx.x] = in[0]; + value[threadIdx.x + stride_size] = in[1]; // make bitonicSort - for (int size = 2; size < num; size <<= 1) { + for (int size = 2; size < upper_bound; size <<= 1) { int bitonic_type = (threadIdx.x & (size / 2)) != 0; for (int stride = size / 2; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - details::Comparator(&value[pos], &value[pos + stride], bitonic_type); + details::Comparator(&value[pos], &value[pos + stride], bitonic_type); } } // last sort - for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + for (int stride = stride_size; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); // last sort when monotonic_type = 1 then increase - details::Comparator(&value[pos], &value[pos + stride], monotonic_type); + details::Comparator(&value[pos], &value[pos + stride], monotonic_type); } __syncthreads(); - dst[0] = value[threadIdx.x]; - dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + out[0] = static_cast(value[threadIdx.x]); + out[1] = static_cast(value[threadIdx.x + stride_size]); } -template -__device__ __forceinline__ void Sort(T* dst, - IndexType* dst_index, - const T* src_data, - IndexType* src_index, +/* +* @brief Sort data with data_index in this block, each thread calculates 2 data, +* the size of out and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: The type of input register. +* OutT: The type of out register. +* IndexType: The type of index. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* +* @param +* out: The register pointer of out, the size is 2. +* out_index: The register pointer of out_index, the size is 2. +* in: The register pointer of input, the size is 2. +* in_index: The register pointer of in_index, the size is 2. +* num: The num of this block. +* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles +* sorted in escending. 
+*/ +template +__device__ __forceinline__ void Sort(OutT* out, + IndexType* out_index, + const InT* in, + IndexType* in_index, int num, int monotonic_type) { - // todo: set num = Pow2(num) + int upper_bound = blockDim.x; + // update upper_bound + upper_bound = std::min(details::GetLastPow2(num), upper_bound); // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 - __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than - // blockDim * 2 + __shared__ InT value[SHARED_SIZE_LIMIT]; + // shareMem's size must larger than blockDim * 2 __shared__ IndexType index[SHARED_SIZE_LIMIT]; - // Copy value and index from src and src_index - value[threadIdx.x] = src_data[0]; - value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // Copy value and index from in and in_index + int stride_size = blockDim.x; + value[threadIdx.x] = in[0]; + value[threadIdx.x + stride_size] = in[1]; // index - index[threadIdx.x] = src_index[0]; - index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1]; + index[threadIdx.x] = in_index[0]; + index[threadIdx.x + stride_size] = in_index[1]; // make bitonicSort - for (int size = 2; size < num; size <<= 1) { + for (int size = 2; size < upper_bound; size <<= 1) { int bitonic_type = (threadIdx.x & (size / 2)) != 0; for (int stride = size / 2; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - details::ComparatorWithIndex(&value[pos], - &value[pos + stride], - &index[pos], - &index[pos + stride], - bitonic_type); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + bitonic_type); } } - for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + for (int stride = stride_size; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); // last sort when monotonic_type = 1 then increase - details::ComparatorWithIndex(&value[pos], - &value[pos + stride], - &index[pos], - &index[pos + stride], - monotonic_type); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + monotonic_type); } __syncthreads(); - dst[0] = value[threadIdx.x]; - dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - dst_index[0] = index[threadIdx.x]; - dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + out[0] = static_cast(value[threadIdx.x]); + out[1] = static_cast(value[threadIdx.x + stride_size]); + out_index[0] = index[threadIdx.x]; + out_index[1] = index[threadIdx.x + stride_size]; +} + +template +HOSTDEVICE __forceinline__ void OperatorTernary( + OutT* out, const T1* in1, const T2* in2, OpFunc func, int num) { + func(out, in1, in2, num); +} + +template +HOSTDEVICE __forceinline__ void OperatorBinary(OutT* out, + const InT* in, + OpFunc func, + int num) { + func(out, in, num); } } // namespace kps diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index a445f4a02ea..1f4ef2ed932 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -348,6 +348,29 @@ __device__ __forceinline__ void Reduce(T* out, } } +/* +* @brief Fill register with a constant according to OpFunc +* +* @template paraments +* InT: The data type of in1 and in2. +* OutT: The data type of out. +* NX: The number of data columns loaded by each thread. +* NY: The number of data rows loaded by each thread. 
+* BlockSize: Identifies the current device thread index method. For xpu, +* core_id() is used as the index. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()() +* const { +* return a; +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). +*/ template or std::tuple + * Index: The index of data stored in dst. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. + * When the number of data processed by this block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The data pointer of the current block. + * size: The current block needs to load size data continuously. */ template __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { int thread_offset = block_offset + threadIdx.x * NX; diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 75b2dbaf7e6..53a8b7d0c9e 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -244,6 +244,24 @@ __device__ __inline__ void ReadData(T* dst, /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. + * + * @template paraments + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. + * ArgsT: The Type if dst, ArgsT can be std::tuple or std::tuple + * Index: The index of data stored in dst. + * BlockSize: Identifies the current device thread index method. For xpu, + * core_id() is used as the index. + * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. + * When the number of data processed by this block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The data pointer of the current block. + * size: The current block needs to load size data continuously. 
*/ template +__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { + int thread_offset = block_offset + core_id() * NX; +#pragma unroll + for (int nx = 0; nx < NX; ++nx) { + dst[nx] = static_cast(thread_offset + nx); + } +} + } // namespace kps } // namespace phi -- GitLab From 197da15ae4a5a127d1ce1208e2bed4bab05f836a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 1 Mar 2022 19:00:30 +0800 Subject: [PATCH 026/272] [phi] tranfer the selu_op and pass the CI (#39819) * tranfer the selu_op and pass the CI * add sig files * fix code * fix by code review * remove TOOD * change the include position * change the head position --- paddle/fluid/operators/selu_op.cc | 8 -- paddle/fluid/operators/selu_op.cu | 22 ---- paddle/fluid/operators/selu_op.h | 123 ------------------ paddle/phi/kernels/cpu/selu_grad_kernel.cc | 21 +++ paddle/phi/kernels/cpu/selu_kernel.cc | 21 +++ paddle/phi/kernels/gpu/selu_grad_kernel.cu | 22 ++++ paddle/phi/kernels/gpu/selu_kernel.cu | 21 +++ .../phi/kernels/impl/selu_grad_kernel_impl.h | 35 +++++ paddle/phi/kernels/impl/selu_kernel_impl.h | 88 +++++++++++++ paddle/phi/kernels/selu_grad_kernel.h | 29 +++++ paddle/phi/kernels/selu_kernel.h | 28 ++++ paddle/phi/ops/compat/selu_sig.cc | 28 ++++ 12 files changed, 293 insertions(+), 153 deletions(-) delete mode 100644 paddle/fluid/operators/selu_op.cu delete mode 100644 paddle/fluid/operators/selu_op.h create mode 100644 paddle/phi/kernels/cpu/selu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/selu_kernel.cc create mode 100644 paddle/phi/kernels/gpu/selu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/selu_kernel.cu create mode 100644 paddle/phi/kernels/impl/selu_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/selu_kernel_impl.h create mode 100644 paddle/phi/kernels/selu_grad_kernel.h create mode 100644 paddle/phi/kernels/selu_kernel.h create mode 100644 paddle/phi/ops/compat/selu_sig.cc diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0adf61d7ce3..88ef1f3ea4a 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - #include #include #include @@ -127,9 +125,3 @@ REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, ops::SeluGradMaker); REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); -REGISTER_OP_CPU_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CPU_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.cu b/paddle/fluid/operators/selu_op.cu deleted file mode 100644 index fb3245ab760..00000000000 --- a/paddle/fluid/operators/selu_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CUDA_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h deleted file mode 100644 index b2fc834c42f..00000000000 --- a/paddle/fluid/operators/selu_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct SeluFunctor { - SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) - : x_data_ptr_(x_data_ptr), - alpha_(alpha), - scale_(scale), - y_data_ptr_(y_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T x_ele = x_data_ptr_[idx]; - if (x_ele <= 0) { - x_ele = alpha_ * real_exp(x_ele) - alpha_; - } - y_data_ptr_[idx] = scale_ * x_ele; - } - const T* x_data_ptr_; - const float alpha_; - const float scale_; - T* y_data_ptr_; -}; - -template -struct SeluGradFunctor { - SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha, - float scale, T* dx_data_ptr) - : y_data_ptr_(y_data_ptr), - dy_data_ptr_(dy_data_ptr), - alpha_(alpha), - scale_(scale), - la_(alpha * scale), - dx_data_ptr_(dx_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T y_ele = y_data_ptr_[idx]; - T dy_ele = dy_data_ptr_[idx]; - - float tmp = scale_; - if (y_ele <= 0) { - tmp = y_ele + la_; - } - dx_data_ptr_[idx] = dy_ele * tmp; - } - const T* y_data_ptr_; - const T* dy_data_ptr_; - const float alpha_; - const float scale_; - const float la_; - T* dx_data_ptr_; -}; - -template -class SeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto out_ptr = out->mutable_data(context.GetPlace()); - - SeluFunctor functor(x->data(), alpha, scale, out_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(x->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -template -class SeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* out = context.Input("Out"); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto dx_ptr 
= dx->mutable_data(context.GetPlace()); - - SeluGradFunctor functor(out->data(), dout->data(), alpha, scale, - dx_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(out->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/selu_grad_kernel.cc b/paddle/phi/kernels/cpu/selu_grad_kernel.cc new file mode 100644 index 00000000000..32101b19132 --- /dev/null +++ b/paddle/phi/kernels/cpu/selu_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + selu_grad, CPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/selu_kernel.cc b/paddle/phi/kernels/cpu/selu_kernel.cc new file mode 100644 index 00000000000..bc5a0616a72 --- /dev/null +++ b/paddle/phi/kernels/cpu/selu_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +PD_REGISTER_KERNEL(selu, CPU, ALL_LAYOUT, phi::SeluKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu new file mode 100644 index 00000000000..0ed299413c1 --- /dev/null +++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/selu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/selu_kernel.cu b/paddle/phi/kernels/gpu/selu_kernel.cu new file mode 100644 index 00000000000..99303d8c18a --- /dev/null +++ b/paddle/phi/kernels/gpu/selu_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/selu_grad_kernel_impl.h b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h new file mode 100644 index 00000000000..d09c87b0a4e --- /dev/null +++ b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void SeluGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float scale, + float alpha, + DenseTensor* dx) { + auto dx_ptr = dev_ctx.template Alloc(dx); + SeluGradFunctor functor( + out.data(), dout.data(), alpha, scale, dx_ptr); + size_t limit = static_cast(out.numel()); + paddle::platform::ForRange for_range(dev_ctx, limit); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h new file mode 100644 index 00000000000..888bac42bfd --- /dev/null +++ b/paddle/phi/kernels/impl/selu_kernel_impl.h @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/operators/math.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +struct SeluFunctor { + SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) + : x_data_ptr_(x_data_ptr), + alpha_(alpha), + scale_(scale), + y_data_ptr_(y_data_ptr) {} + + HOSTDEVICE void operator()(size_t idx) const { + T x_ele = x_data_ptr_[idx]; + if (x_ele <= 0) { + x_ele = alpha_ * paddle::operators::real_exp(x_ele) - alpha_; + } + y_data_ptr_[idx] = scale_ * x_ele; + } + const T* x_data_ptr_; + const float alpha_; + const float scale_; + T* y_data_ptr_; +}; + +template +struct SeluGradFunctor { + SeluGradFunctor(const T* y_data_ptr, + const T* dy_data_ptr, + float alpha, + float scale, + T* dx_data_ptr) + : y_data_ptr_(y_data_ptr), + dy_data_ptr_(dy_data_ptr), + alpha_(alpha), + scale_(scale), + la_(alpha * scale), + dx_data_ptr_(dx_data_ptr) {} + + HOSTDEVICE void operator()(size_t idx) const { + T y_ele = y_data_ptr_[idx]; + T dy_ele = dy_data_ptr_[idx]; + + float tmp = scale_; + if (y_ele <= 0) { + tmp = y_ele + la_; + } + dx_data_ptr_[idx] = dy_ele * tmp; + } + const T* y_data_ptr_; + const T* dy_data_ptr_; + const float alpha_; + const float scale_; + const float la_; + T* dx_data_ptr_; +}; + +template +void SeluKernel(const Context& dev_ctx, + const DenseTensor& x, + float scale, + float alpha, + DenseTensor* out) { + auto out_ptr = dev_ctx.template Alloc(out); + SeluFunctor functor(x.data(), alpha, scale, out_ptr); + size_t limit = static_cast(x.numel()); + paddle::platform::ForRange for_range(dev_ctx, limit); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/selu_grad_kernel.h b/paddle/phi/kernels/selu_grad_kernel.h new file mode 100644 index 00000000000..42cde6deabe --- /dev/null +++ b/paddle/phi/kernels/selu_grad_kernel.h @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SeluGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& d_out, + float scale, + float alpha, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/selu_kernel.h b/paddle/phi/kernels/selu_kernel.h new file mode 100644 index 00000000000..cd5d27e98cc --- /dev/null +++ b/paddle/phi/kernels/selu_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SeluKernel(const Context& dev_ctx, + const DenseTensor& x, + float scale, + float alpha, + DenseTensor* out); +} // phi diff --git a/paddle/phi/ops/compat/selu_sig.cc b/paddle/phi/ops/compat/selu_sig.cc new file mode 100644 index 00000000000..23f5cc34515 --- /dev/null +++ b/paddle/phi/ops/compat/selu_sig.cc @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SeluGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("selu_grad", + {"Out", GradVarName("Out")}, + {"scale", "alpha"}, + {GradVarName("X")}); +} +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(selu_grad, phi::SeluGradGradOpArgumentMapping); -- GitLab From 090396368c80360fc33d09dfb1df7492f7dfb544 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 1 Mar 2022 19:23:04 +0800 Subject: [PATCH 027/272] [Phi]rm reduce infershape (#39820) * modify infershape utils and rm reduce infershape * merge develop * fix infermete bug * add IsForInferShape func in ArgumentMappingContext * add reduce_mean infermeta * modify annotation * add default dims --- paddle/fluid/framework/infershape_utils.cc | 6 +- paddle/fluid/framework/operator.h | 2 + .../operators/reduce_ops/reduce_mean_op.cc | 10 +++- .../operators/reduce_ops/reduce_sum_op.cc | 10 +++- .../dialect/phi/pass/proto_arg_map_context.h | 2 + paddle/phi/core/compat/arg_map_context.h | 4 ++ paddle/phi/infermeta/unary.cc | 60 +++++++++++++++---- paddle/phi/infermeta/unary.h | 15 +++-- paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/ops/compat/reduce_sig.cc | 34 +++++++---- paddle/phi/tests/ops/test_op_signature.h | 2 + python/paddle/utils/code_gen/api.yaml | 2 +- 12 files changed, 117 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index d9287b9a624..57fb68e8042 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -88,6 +88,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsForInferShape() const override { return true; } + private: const InferShapeContext& ctx_; }; @@ -127,7 +129,9 @@ class 
CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return phi::make_ddim(var->GetShape()); + + return var->GetShape().empty() ? phi::make_ddim({0UL}) + : phi::make_ddim(var->GetShape()); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 16718a31651..e33d4feb82a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -489,6 +489,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.OutputVar(name)->IsType(); } + bool IsForInferShape() const override { return false; } + private: const ExecutionContext& ctx_; }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index e80df5f95bb..6157a3a925d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -18,6 +18,10 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -92,9 +96,13 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; +DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PT_INFER_META(phi::MeanRawInferMeta)); + REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, - ops::ReduceMeanOpGradMaker); + ops::ReduceMeanOpGradMaker, + ReduceMeanInferShapeFunctor); REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index bdab14a18a0..8ef0712dc7a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -16,6 +16,10 @@ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -98,10 +102,14 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; +DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PT_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, ops::ReduceSumOpGradMaker, - ops::ReduceSumOpGradMaker); + ops::ReduceSumOpGradMaker, + ReduceSumInferShapeFunctor); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 843b19d217f..ca8a22a7e75 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -46,6 +46,8 @@ class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; + bool IsForInferShape() const override { return false; } + private: mlir::Operation* op_; const std::unordered_map& input_map_; diff --git a/paddle/phi/core/compat/arg_map_context.h 
b/paddle/phi/core/compat/arg_map_context.h index af29b3bab5c..f625d57df2e 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -91,6 +91,10 @@ class ArgumentMappingContext { virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; + + // use this function to mark it comes from InferShapeArgumentMappingContext + // and will be used in infershape + virtual bool IsForInferShape() const = 0; }; } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4696187bd23..983e0162264 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -375,7 +375,7 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } -/* Why not use ReduceInferMeta directly? +/* Why not use ReduceInferMetaBase directly? Because we need make InferMetaFunction's args follow the design of api.yaml */ void SumInferMeta(const MetaTensor& x, @@ -383,22 +383,53 @@ void SumInferMeta(const MetaTensor& x, DataType dtype, bool keep_dim, MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, dtype, out); + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, dtype, out); } void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, + bool reduce_all, DataType dtype, MetaTensor* out) { - bool reduce_all = true; - std::set dims_set(axis.begin(), axis.end()); + auto x_rank = x.dims().size(); + + std::vector formated_axis = axis; + for (size_t i = 0; i < axis.size(); ++i) { + PADDLE_ENFORCE_LT(axis[i], + x_rank, + errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimesion = %d. But received dim index = %d.", + i, + x_rank, + axis[i])); + PADDLE_ENFORCE_GE(axis[i], + -x_rank, + errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimesion = %d. 
But received dim index = %d.", + i, + x_rank, + axis[i])); + + if (axis[i] < 0) { + formated_axis[i] = axis[i] + x_rank; + } + } + + bool full_dim = true; + std::set dims_set(formated_axis.begin(), formated_axis.end()); for (int64_t i = 0; i < x.dims().size(); ++i) { if (dims_set.find(i) == dims_set.end()) { - reduce_all = false; + full_dim = false; break; } } + reduce_all = reduce_all || full_dim; std::vector out_dim_vector; if (keep_dim) { @@ -441,11 +472,20 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, DataType::UNDEFINED, out); +void MeanRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out) { + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); +} + +void MeanInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out) { + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); } void TransferLayoutInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index b3929b9d2b4..a2d779e0f70 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -86,13 +86,20 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, + bool reduce_all, DataType dtype, MetaTensor* out); -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void MeanRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out); + +void MeanInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); void SumInferMeta(const MetaTensor& x, const std::vector& axis, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index c6036f4a042..342393d79bd 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, x.dtype(), &meta_out); + ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 74704671f8b..6395486ed2b 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -17,28 +17,36 @@ limitations under the License. */ namespace phi { KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { - bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); if (ctx.IsDenseTensorInput("X")) { - if (!reduce_all) { - return KernelSignature( - "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "sum_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the "sum_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature("sum_raw", + {"X"}, + {"dim", "keep_dim", "reduce_all", "out_dtype"}, + {"Out"}); } - return KernelSignature("sum_raw", - {"X"}, - {"dim", "keep_dim", "reduce_all", "out_dtype"}, - {"Out"}); + return KernelSignature( + "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); } return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { - bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); if (ctx.IsDenseTensorInput("X")) { - if (!reduce_all) { - return KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, {"Out"}); + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "mean_raw" KernelSignature. + // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the + // "mean_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } - return KernelSignature( - "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + return KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, {"Out"}); } return KernelSignature("unregistered", {}, {}, {}); } diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index fcd2d397fa2..06048f33d94 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -80,6 +80,8 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_outputs.count(name) > 0; } + bool IsForInferShape() const override { return false; } + private: const std::unordered_set dense_tensor_inputs; const std::unordered_set selected_rows_inputs; diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 7ea8493b67f..45a6aae5e6d 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -124,7 +124,7 @@ args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) output : Tensor infer_meta : - func : ReduceInferMeta + func : MeanInferMeta kernel : func : mean -- GitLab From 69ab270021c51ce70345f484e52eadb5165b9c54 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 1 Mar 2022 20:11:33 +0800 Subject: [PATCH 028/272] fix compiling and running with ipu (#39920) --- paddle/fluid/framework/phi_utils.cc | 9 + .../fluid/platform/device/ipu/ipu_strategy.cc | 306 ++++++++++-------- .../fluid/platform/device/ipu/ipu_strategy.h | 72 +++-- paddle/fluid/pybind/pybind.cc | 2 + .../fluid/tests/unittests/ipu/CMakeLists.txt | 8 + 5 files changed, 231 insertions(+), 166 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 93bc2c02d57..14997dd9610 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -125,6 +125,15 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } +#endif +#ifdef PADDLE_WITH_IPU + if (platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "pten missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + 
return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), + kernel_key.dtype()); + } #endif return phi::KernelKey(); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 943dfcc6cff..e806b0b30e4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -120,121 +120,151 @@ IpuStrategy::IpuStrategy() { RegisterGetter(options_getter, options_type, #name, "string", \ [&]() { return popart_options.aliased_name; }) -#define ADD_POPART_ENUM_OPTION(name, EnumType) \ - ADD_POPART_ENUM_OPTION_ALIAS(name, name, EnumType) - -#define ADD_POPART_BOOL_OPTION(name) ADD_POPART_BOOL_OPTION_ALIAS(name, name) - -#define ADD_POPART_UINT64_OPTION(name) \ - ADD_POPART_UINT64_OPTION_ALIAS(name, name) - -#define ADD_POPART_DOUBLE_OPTION(name) \ - ADD_POPART_DOUBLE_OPTION_ALIAS(name, name) - -#define ADD_POPART_STRING_OPTION(name) \ - ADD_POPART_STRING_OPTION_ALIAS(name, name) - - ADD_POPART_ENUM_OPTION(autodiffSettings.stitchStrategy, - AutodiffStitchStrategy); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.transformContext, - BatchSerializationTransformContext); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.method, - BatchSerializationMethod); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.batchSchedule, - BatchSerializationBatchSchedule); - ADD_POPART_ENUM_OPTION(autoRecomputation, RecomputationType); - ADD_POPART_ENUM_OPTION(mergeVarUpdate, MergeVarUpdateType); - ADD_POPART_ENUM_OPTION(virtualGraphMode, VirtualGraphMode); - ADD_POPART_ENUM_OPTION(syntheticDataMode, SyntheticDataMode); - ADD_POPART_ENUM_OPTION(subgraphCopyingStrategy, SubgraphCopyingStrategy); - ADD_POPART_ENUM_OPTION(accumulationAndReplicationReductionType, - ReductionType); - ADD_POPART_ENUM_OPTION(meanAccumulationAndReplicationReductionStrategy, - MeanReductionStrategy); - - ADD_POPART_STRING_OPTION(logDir); - ADD_POPART_STRING_OPTION(cachePath); - ADD_POPART_STRING_OPTION(partialsTypeMatMuls); - ADD_POPART_STRING_OPTION(customCodeletCompileFlags); - ADD_POPART_STRING_OPTION(serializedPoprithmsShiftGraphsDir); - ADD_POPART_STRING_OPTION(kahnTieBreaker); - - ADD_POPART_UINT64_OPTION(executionPhaseSettings.phases); - ADD_POPART_UINT64_OPTION(executionPhaseSettings.stages); - ADD_POPART_UINT64_OPTION(batchSerializationSettings.factor); - ADD_POPART_UINT64_OPTION(firstDotOp); - ADD_POPART_UINT64_OPTION(finalDotOp); - ADD_POPART_UINT64_OPTION(numIOTiles); - ADD_POPART_UINT64_OPTION(mergeVarUpdateMemThreshold); - ADD_POPART_UINT64_OPTION(looseThresholdAtPeak); - ADD_POPART_UINT64_OPTION(accumulationFactor); - ADD_POPART_UINT64_OPTION(swapLimitScheduler); - ADD_POPART_UINT64_OPTION(globalReplicationFactor); - ADD_POPART_UINT64_OPTION(globalReplicaOffset); - ADD_POPART_UINT64_OPTION(defaultPrefetchBufferingDepth); - ADD_POPART_UINT64_OPTION(compilationProgressTotal); - ADD_POPART_UINT64_OPTION(transitiveClosureOptimizationThreshold); - - ADD_POPART_BOOL_OPTION(batchSerializationSettings.concatOnVirtualGraphChange); - ADD_POPART_BOOL_OPTION( + ADD_POPART_ENUM_OPTION_ALIAS(autodiff_settings.stitch_strategy, + autodiffSettings.stitchStrategy, + AutodiffStitchStrategy); + ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.transform_context, + batchSerializationSettings.transformContext, + BatchSerializationTransformContext); + ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.method, + batchSerializationSettings.method, + BatchSerializationMethod); + 
ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.batch_schedule, + batchSerializationSettings.batchSchedule, + BatchSerializationBatchSchedule); + ADD_POPART_ENUM_OPTION_ALIAS(auto_recomputation, autoRecomputation, + RecomputationType); + ADD_POPART_ENUM_OPTION_ALIAS(merge_var_update, mergeVarUpdate, + MergeVarUpdateType); + ADD_POPART_ENUM_OPTION_ALIAS(virtual_graph_mode, virtualGraphMode, + VirtualGraphMode); + ADD_POPART_ENUM_OPTION_ALIAS(synthetic_data_mode, syntheticDataMode, + SyntheticDataMode); + ADD_POPART_ENUM_OPTION_ALIAS(subgraph_copying_strategy, + subgraphCopyingStrategy, + SubgraphCopyingStrategy); + ADD_POPART_ENUM_OPTION_ALIAS(accumulation_and_replication_reduction_type, + accumulationAndReplicationReductionType, + ReductionType); + ADD_POPART_ENUM_OPTION_ALIAS( + mean_accumulation_and_replication_reduction_strategy, + meanAccumulationAndReplicationReductionStrategy, MeanReductionStrategy); + + ADD_POPART_STRING_OPTION_ALIAS(log_dir, logDir); + ADD_POPART_STRING_OPTION_ALIAS(cache_path, cachePath); + ADD_POPART_STRING_OPTION_ALIAS(partials_type_matmuls, partialsTypeMatMuls); + ADD_POPART_STRING_OPTION_ALIAS(custom_codelet_compile_flags, + customCodeletCompileFlags); + ADD_POPART_STRING_OPTION_ALIAS(serialized_poprithms_shift_graphs_dir, + serializedPoprithmsShiftGraphsDir); + ADD_POPART_STRING_OPTION_ALIAS(kahn_tie_breaker, kahnTieBreaker); + + ADD_POPART_UINT64_OPTION_ALIAS(execution_phase_settings.phases, + executionPhaseSettings.phases); + ADD_POPART_UINT64_OPTION_ALIAS(execution_phase_settings.stages, + executionPhaseSettings.stages); + ADD_POPART_UINT64_OPTION_ALIAS(batch_serialization_settings.factor, + batchSerializationSettings.factor); + ADD_POPART_UINT64_OPTION_ALIAS(first_dot_op, firstDotOp); + ADD_POPART_UINT64_OPTION_ALIAS(final_dot_op, finalDotOp); + ADD_POPART_UINT64_OPTION_ALIAS(num_io_tiles, numIOTiles); + ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, + mergeVarUpdateMemThreshold); + ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); + ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); + ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); + ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, + globalReplicationFactor); + ADD_POPART_UINT64_OPTION_ALIAS(global_replica_offset, globalReplicaOffset); + ADD_POPART_UINT64_OPTION_ALIAS(default_prefetch_buffering_depth, + defaultPrefetchBufferingDepth); + ADD_POPART_UINT64_OPTION_ALIAS(compilation_progress_total, + compilationProgressTotal); + ADD_POPART_UINT64_OPTION_ALIAS(transitive_closure_optimization_threshold, + transitiveClosureOptimizationThreshold); + + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_virtual_graph_change, + batchSerializationSettings.concatOnVirtualGraphChange); + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_execution_phase_change, batchSerializationSettings.concatOnExecutionPhaseChange); - ADD_POPART_BOOL_OPTION( + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_pipeline_stage_change, batchSerializationSettings.concatOnPipelineStageChange); - ADD_POPART_BOOL_OPTION(strictOpVersions); - ADD_POPART_BOOL_OPTION(opxAliasChecking); - ADD_POPART_BOOL_OPTION(opxModifyChecking); - ADD_POPART_BOOL_OPTION(dotOpNames); - ADD_POPART_BOOL_OPTION(exportPoplarComputationGraph); - ADD_POPART_BOOL_OPTION(exportPoplarVertexGraph); - ADD_POPART_BOOL_OPTION(separateCallOpPdfs); - ADD_POPART_BOOL_OPTION(enableOutlining); - 
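// --- Illustrative aside, not part of the patch ------------------------------
// The net effect of switching these registrations to the *_ALIAS form is that
// Paddle-facing code now addresses every option by a snake_case name, while
// the popart::SessionOptions fields keep their camelCase identifiers. A
// minimal assumed usage sketch (the option values here are made up):
//
//   IpuStrategy strategy;
//   strategy.AddBoolOption("enable_outlining", true);       // was "enableOutlining"
//   strategy.AddStringOption("cache_path", "/tmp/popart");  // was "cachePath"
//   VLOG(10) << "enable_outlining = " << strategy.GetOption("enable_outlining");
// -----------------------------------------------------------------------------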
ADD_POPART_BOOL_OPTION(enableOutliningCopyCostPruning); - ADD_POPART_BOOL_OPTION(rearrangeAnchorsOnHost); - ADD_POPART_BOOL_OPTION(enablePrefetchDatastreams); - ADD_POPART_BOOL_OPTION(enableNonStableSoftmax); - ADD_POPART_BOOL_OPTION(enableReplicatedGraphs); - ADD_POPART_BOOL_OPTION(enableGradientAccumulation); - ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter); - ADD_POPART_BOOL_OPTION(enablePipelining); + ADD_POPART_BOOL_OPTION_ALIAS(strict_op_versions, strictOpVersions); + ADD_POPART_BOOL_OPTION_ALIAS(opx_alias_checking, opxAliasChecking); + ADD_POPART_BOOL_OPTION_ALIAS(opx_modify_checking, opxModifyChecking); + ADD_POPART_BOOL_OPTION_ALIAS(dot_op_names, dotOpNames); + ADD_POPART_BOOL_OPTION_ALIAS(export_poplar_computation_graph, + exportPoplarComputationGraph); + ADD_POPART_BOOL_OPTION_ALIAS(export_poplar_vertex_graph, + exportPoplarVertexGraph); + ADD_POPART_BOOL_OPTION_ALIAS(separate_call_op_pdfs, separateCallOpPdfs); + ADD_POPART_BOOL_OPTION_ALIAS(enable_outlining, enableOutlining); + ADD_POPART_BOOL_OPTION_ALIAS(enable_outlining_copy_cost_pruning, + enableOutliningCopyCostPruning); + ADD_POPART_BOOL_OPTION_ALIAS(rearrange_anchors_on_host, + rearrangeAnchorsOnHost); + ADD_POPART_BOOL_OPTION_ALIAS(enable_prefetch_datastreams, + enablePrefetchDatastreams); + ADD_POPART_BOOL_OPTION_ALIAS(enable_non_stable_softmax, + enableNonStableSoftmax); + ADD_POPART_BOOL_OPTION_ALIAS(enable_replicated_graphs, + enableReplicatedGraphs); + ADD_POPART_BOOL_OPTION_ALIAS(enable_gradient_accumulation, + enableGradientAccumulation); + ADD_POPART_BOOL_OPTION_ALIAS(instrument_with_hardware_cycle_counter, + instrumentWithHardwareCycleCounter); ADD_POPART_BOOL_OPTION_ALIAS(enable_pipelining, enablePipelining); - ADD_POPART_BOOL_OPTION(disableGradAccumulationTensorStreams); - ADD_POPART_BOOL_OPTION(compileEngine); - ADD_POPART_BOOL_OPTION(constantWeights); - ADD_POPART_BOOL_OPTION(enableEngineCaching); - ADD_POPART_BOOL_OPTION(enableMergeExchange); - ADD_POPART_BOOL_OPTION(enableFloatingPointChecks); - ADD_POPART_BOOL_OPTION(enableStochasticRounding); + ADD_POPART_BOOL_OPTION_ALIAS(disable_grad_accumulation_tensor_streams, + disableGradAccumulationTensorStreams); + ADD_POPART_BOOL_OPTION_ALIAS(compile_engine, compileEngine); + ADD_POPART_BOOL_OPTION_ALIAS(constant_weights, constantWeights); + ADD_POPART_BOOL_OPTION_ALIAS(enable_engine_caching, enableEngineCaching); + ADD_POPART_BOOL_OPTION_ALIAS(enable_merge_exchange, enableMergeExchange); + ADD_POPART_BOOL_OPTION_ALIAS(enable_floating_point_checks, + enableFloatingPointChecks); ADD_POPART_BOOL_OPTION_ALIAS(enable_stochastic_rounding, enableStochasticRounding); - ADD_POPART_BOOL_OPTION(explicitRecomputation); - ADD_POPART_BOOL_OPTION(enableExplicitMainLoops); - ADD_POPART_BOOL_OPTION(useHostCopyOps); - ADD_POPART_BOOL_OPTION(aliasZeroCopy); - ADD_POPART_BOOL_OPTION(delayVarUpdates); - ADD_POPART_BOOL_OPTION(enableFullyConnectedPass); - ADD_POPART_BOOL_OPTION(enableSerializedMatmuls); - ADD_POPART_BOOL_OPTION(enableStableNorm); - ADD_POPART_BOOL_OPTION(decomposeGradSum); - ADD_POPART_BOOL_OPTION(enableDistributedReplicatedGraphs); - ADD_POPART_BOOL_OPTION(groupHostSync); - ADD_POPART_BOOL_OPTION(automaticLossScalingSettings.enabled); - ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter); - ADD_POPART_BOOL_OPTION(enableSupportedDataTypeCasting); - ADD_POPART_BOOL_OPTION(groupNormStridedChannelGrouping); - ADD_POPART_BOOL_OPTION(scheduleNonWeightUpdateGradientConsumersEarly); - - ADD_POPART_DOUBLE_OPTION(outlineSequenceBreakCost); - 
ADD_POPART_DOUBLE_OPTION(outlineThreshold); - ADD_POPART_DOUBLE_OPTION(timeLimitScheduler); - ADD_POPART_DOUBLE_OPTION(automaticLossScalingSettings.binEdgeLocation); - ADD_POPART_DOUBLE_OPTION( + ADD_POPART_BOOL_OPTION_ALIAS(explicit_recomputation, explicitRecomputation); + ADD_POPART_BOOL_OPTION_ALIAS(enable_explicit_main_loops, + enableExplicitMainLoops); + ADD_POPART_BOOL_OPTION_ALIAS(use_host_copy_ops, useHostCopyOps); + ADD_POPART_BOOL_OPTION_ALIAS(alias_zero_copy, aliasZeroCopy); + ADD_POPART_BOOL_OPTION_ALIAS(delay_var_updates, delayVarUpdates); + ADD_POPART_BOOL_OPTION_ALIAS(enable_fully_connected_pass, + enableFullyConnectedPass); + ADD_POPART_BOOL_OPTION_ALIAS(enable_serialized_matmuls, + enableSerializedMatmuls); + ADD_POPART_BOOL_OPTION_ALIAS(enable_stable_norm, enableStableNorm); + ADD_POPART_BOOL_OPTION_ALIAS(decompose_grad_sum, decomposeGradSum); + ADD_POPART_BOOL_OPTION_ALIAS(enable_distributed_replicated_graphs, + enableDistributedReplicatedGraphs); + ADD_POPART_BOOL_OPTION_ALIAS(group_host_sync, groupHostSync); + ADD_POPART_BOOL_OPTION_ALIAS(automatic_loss_scaling_settings.enabled, + automaticLossScalingSettings.enabled); + ADD_POPART_BOOL_OPTION_ALIAS(instrument_with_hardware_cycle_counter, + instrumentWithHardwareCycleCounter); + ADD_POPART_BOOL_OPTION_ALIAS(enable_supported_data_type_casting, + enableSupportedDataTypeCasting); + ADD_POPART_BOOL_OPTION_ALIAS(group_norm_strided_channel_grouping, + groupNormStridedChannelGrouping); + ADD_POPART_BOOL_OPTION_ALIAS( + schedule_non_weight_update_gradient_consumers_early, + scheduleNonWeightUpdateGradientConsumersEarly); + + ADD_POPART_DOUBLE_OPTION_ALIAS(outline_sequence_break_cost, + outlineSequenceBreakCost); + ADD_POPART_DOUBLE_OPTION_ALIAS(outline_threshold, outlineThreshold); + ADD_POPART_DOUBLE_OPTION_ALIAS(time_limit_scheduler, timeLimitScheduler); + ADD_POPART_DOUBLE_OPTION_ALIAS( + automatic_loss_scaling_settings.bin_edge_location, + automaticLossScalingSettings.binEdgeLocation); + ADD_POPART_DOUBLE_OPTION_ALIAS( + automatic_loss_scaling_settings.threshold_upper_count_proportion, automaticLossScalingSettings.thresholdUpperCountProportion); -#undef ADD_POPART_STRING_OPTION -#undef ADD_POPART_DOUBLE_OPTION -#undef ADD_POPART_UINT64_OPTION -#undef ADD_POPART_BOOL_OPTION -#undef ADD_POPART_ENUM_OPTION #undef ADD_POPART_STRING_OPTION_ALIAS #undef ADD_POPART_DOUBLE_OPTION_ALIAS #undef ADD_POPART_UINT64_OPTION_ALIAS @@ -278,14 +308,14 @@ IpuStrategy::IpuStrategy() { }); RegisterSetter( - container_options, "dotChecks", + container_options, "dot_checks", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.dotChecks.insert(static_cast(value)); }); RegisterGetter( - vector_options_getter, options_type, "dotChecks", "vector", [&]() { + vector_options_getter, options_type, "dot_checks", "vector", [&]() { std::vector res; for (auto x : popart_options.dotChecks) { res.push_back(std::to_string(static_cast(x))); @@ -293,7 +323,7 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "hardwareInstrumentations", + RegisterSetter(container_options, "hardware_instrumentations", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.hardwareInstrumentations.insert( @@ -301,8 +331,8 @@ IpuStrategy::IpuStrategy() { }); RegisterGetter( - vector_options_getter, options_type, "hardwareInstrumentations", "vector", - [&]() { + vector_options_getter, options_type, "hardware_instrumentations", + "vector", [&]() { std::vector res; for (auto x : 
popart_options.hardwareInstrumentations) { res.push_back(std::to_string(static_cast(x))); @@ -310,12 +340,12 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "customCodelets", + RegisterSetter(container_options, "custom_codelets", [&](const std::pair& p) { popart_options.customCodelets.push_back(p.first); }); - RegisterGetter(vector_options_getter, options_type, "customCodelets", + RegisterGetter(vector_options_getter, options_type, "custom_codelets", "vector", [&]() { std::vector res; for (auto x : popart_options.customCodelets) { @@ -324,44 +354,44 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "engineOptions", + RegisterSetter(container_options, "engine_options", [&](const std::pair& p) { popart_options.engineOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "engineOptions", "map", + RegisterGetter(map_options_getter, options_type, "engine_options", "map", [&]() { return popart_options.engineOptions; }); - RegisterSetter(container_options, "reportOptions", + RegisterSetter(container_options, "report_options", [&](const std::pair& p) { popart_options.reportOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "reportOptions", "map", + RegisterGetter(map_options_getter, options_type, "report_options", "map", [&]() { return popart_options.reportOptions; }); - RegisterSetter(container_options, "convolutionOptions", + RegisterSetter(container_options, "convolution_options", [&](const std::pair& p) { popart_options.convolutionOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "convolutionOptions", "map", + RegisterGetter(map_options_getter, options_type, "convolution_options", "map", [&]() { return popart_options.convolutionOptions; }); - RegisterSetter(container_options, "lstmOptions", + RegisterSetter(container_options, "lstm_options", [&](const std::pair& p) { popart_options.lstmOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "lstmOptions", "map", + RegisterGetter(map_options_getter, options_type, "lstm_options", "map", [&]() { return popart_options.lstmOptions; }); - RegisterSetter(container_options, "gclOptions", + RegisterSetter(container_options, "gcl_options", [&](const std::pair& p) { popart_options.gclOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "gclOptions", "map", + RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { return popart_options.gclOptions; }); } @@ -415,21 +445,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, "Unknown tensor location: %s", tensor)); } - if (opt == "minElementsForOffChip") { + if (opt == "min_elements_for_off_chip") { settings->minElementsForOffChip = value; - } else if (opt == "minElementsForReplicatedTensorSharding") { + } else if (opt == "min_elements_for_replicated_tensor_sharding") { settings->minElementsForReplicatedTensorSharding = value; - } else if (opt == "onChip") { + } else if (opt == "on_chip") { settings->location.storage = value > 0 ? popart::TensorStorage::OnChip : popart::TensorStorage::OffChip; - } else if (opt == "useReplicatedTensorSharding") { + } else if (opt == "use_replicated_tensor_sharding") { settings->location.replicatedTensorSharding = value > 0 ? popart::ReplicatedTensorSharding::On : popart::ReplicatedTensorSharding::Off; - } else if (opt == "useIOTilesToLoad") { + } else if (opt == "use_io_tiles_to_load") { settings->location.loadTileSet = value > 0 ? 
popart::TileSet::IO : popart::TileSet::Compute; - } else if (opt == "useIOTilesToStore") { + } else if (opt == "use_io_tiles_to_store") { settings->location.storageTileSet = value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; } else { @@ -464,6 +494,20 @@ std::string IpuStrategy::GetOptionType(const std::string& option) { return options_type[option]; } +std::vector IpuStrategy::GetAllOptionNames() { + std::vector names; + for (auto& option : options_getter) { + names.push_back(option.first); + } + for (auto& option : vector_options_getter) { + names.push_back(option.first); + } + for (auto& option : map_options_getter) { + names.push_back(option.first); + } + return names; +} + void IpuStrategy::EnablePattern(const std::string& t) { VLOG(10) << "enable popart pattern: " << t; popart_patterns.enablePattern(t, true); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 64436dc14fe..571fb1e1637 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -24,7 +24,8 @@ namespace paddle { namespace platform { namespace ipu { -struct IpuStrategy { +class IpuStrategy { + public: IpuStrategy(); // TODO(alleng) create PaddleOptions @@ -75,22 +76,30 @@ struct IpuStrategy { // custom ops std::vector custom_ops; - private: - std::map> bool_options; - std::map> uint64_options; - std::map> double_options; - std::map> string_options; - std::map)>> - container_options; + public: + void AddBoolOption(const std::string &option, bool value); + void AddUint64Option(const std::string &option, std::uint64_t value); + void AddDoubleOption(const std::string &option, double value); + void AddStringOption(const std::string &option, const std::string &value); + void InsertStringOption(const std::string &option, const std::string &value); + void InsertStringPairOption(const std::string &option, const std::string &key, + const std::string &value); + void SetTensorLocation(const std::string &tensor, const std::string &option, + std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); - std::map> options_getter; - std::map()>> - vector_options_getter; - std::map()>> - map_options_getter; - std::map options_type; + std::string GetOption(const std::string &); + std::vector GetVectorOption(const std::string &); + std::map GetMapOption(const std::string &); + std::string GetOptionType(const std::string &); + std::vector GetAllOptionNames(); + + void EnablePattern(const std::string &t); + void DisablePattern(const std::string &t); + const bool IsPatternEnabled(const std::string &t); + private: template void set( const std::string &key, ValueType value, @@ -117,27 +126,20 @@ struct IpuStrategy { return it->second(); } - public: - void AddBoolOption(const std::string &option, bool value); - void AddUint64Option(const std::string &option, std::uint64_t value); - void AddDoubleOption(const std::string &option, double value); - void AddStringOption(const std::string &option, const std::string &value); - void InsertStringOption(const std::string &option, const std::string &value); - void InsertStringPairOption(const std::string &option, const std::string &key, - const std::string &value); - void SetTensorLocation(const std::string &tensor, const std::string &option, - std::uint64_t value); - void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, - const std::string &domain, int version); - - 
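// --- Illustrative aside, not part of the patch ------------------------------
// The new GetAllOptionNames() pairs with the existing GetOptionType() accessor
// (and backs the get_all_option_names binding added in pybind.cc further down),
// so callers can enumerate every registered option. Assumed usage sketch:
//
//   IpuStrategy strategy;
//   for (const auto &name : strategy.GetAllOptionNames()) {
//     VLOG(10) << name << " : " << strategy.GetOptionType(name);
//   }
// -----------------------------------------------------------------------------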
std::string GetOption(const std::string &); - std::vector GetVectorOption(const std::string &); - std::map GetMapOption(const std::string &); - std::string GetOptionType(const std::string &); + std::map> bool_options; + std::map> uint64_options; + std::map> double_options; + std::map> string_options; + std::map)>> + container_options; - void EnablePattern(const std::string &t); - void DisablePattern(const std::string &t); - const bool IsPatternEnabled(const std::string &t); + std::map> options_getter; + std::map()>> + vector_options_getter; + std::map()>> + map_options_getter; + std::map options_type; }; } // namespace ipu diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6e553ad2e60..3d8815e2eb6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3919,6 +3919,8 @@ All parameter, weight, gradient are variables in Paddle. } return res; }) + .def("get_all_option_names", + &platform::ipu::IpuStrategy::GetAllOptionNames) .def("enable_pattern", &platform::ipu::IpuStrategy::EnablePattern) .def("disable_pattern", &platform::ipu::IpuStrategy::DisablePattern) .def("is_pattern_enabled", &platform::ipu::IpuStrategy::IsPatternEnabled); diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt new file mode 100644 index 00000000000..959700ad743 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -0,0 +1,8 @@ +if(WITH_IPU) + file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") + string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + + foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) +endif() -- GitLab From 4617c1b2da8b061015d4a23f01ad81109ea931a7 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 1 Mar 2022 20:13:14 +0800 Subject: [PATCH 029/272] fix bug of paddle.to_tensor and paddle.moveaxis (#39662) * fix bug of paddle.to_tensor and paddle.moveaxis * fix CI --- .../tests/unittests/test_transpose_op.py | 8 +++++ .../fluid/tests/unittests/test_var_base.py | 4 +++ python/paddle/tensor/creation.py | 31 +++++++++---------- python/paddle/tensor/manipulation.py | 7 +++-- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 0fc56726c5d..13b880b28bf 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -423,6 +423,14 @@ class TestMoveAxis(unittest.TestCase): self.assertEqual(np.array_equal(out.numpy(), expected), True) paddle.enable_static() + def test_moveaxis3(self): + paddle.disable_static() + x = paddle.to_tensor( + [[1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j]]) + out = x.moveaxis(0, 1) + self.assertEqual(out.shape, [2, 3]) + paddle.enable_static() + def test_error(self): x = paddle.randn([2, 3, 4, 5]) # src must have the same number with dst diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 541df6659c2..dbd40c349bb 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -51,6 +51,10 @@ class TestVarBase(unittest.TestCase): np.array_equal(x.numpy(), np.array([1.2], 'float16'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) + # set_default_dtype take effect on int + x = paddle.to_tensor(1, 
place=place) + self.assertTrue(x.dtype, core.VarDesc.VarType.INT64) + # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) self.assertTrue( diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index ae563e641e3..bddc45bc961 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -110,12 +110,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" ) - #Todo(zhouwei): Support allocate tensor on any other specified card - if isinstance(place, core.CUDAPlace) and isinstance( - _current_expected_place(), core.CUDAPlace) and place._get_device_id( - ) != _current_expected_place()._get_device_id(): - place = _current_expected_place() - if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): @@ -139,7 +133,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): data.stop_gradient = stop_gradient return data elif isinstance(data, (core.LoDTensor, core.Tensor)): - # Note(zhouwei25): should't expose it to users, just for internal use. + # should't expose it to users, just for internal use. # convert core.Tensor/core.LoDTensor to VarBase first # Currenly, there is no copy when places are same data = paddle.Tensor(data) @@ -152,15 +146,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". format(type(data))) - if not dtype and data.dtype in [ - 'float16', 'float32', 'float64', 'complex64', 'complex128' - ]: - default_type = paddle.get_default_dtype() - if np.iscomplexobj(data): - default_type = 'complex64' if default_type in [ - 'float16', 'float32' - ] else 'complex128' - data = data.astype(default_type) + if not dtype: + if data.dtype in [ + 'float16', 'float32', 'float64', 'complex64', 'complex128' + ]: + default_type = paddle.get_default_dtype() + if np.iscomplexobj(data): + default_type = 'complex64' if default_type in [ + 'float16', 'float32' + ] else 'complex128' + data = data.astype(default_type) + # Windows default type is 'int32', while Linux/Mac is 'int64'. Unify they. 
+ if data.dtype in ['int32']: + default_type = "int64" + data = data.astype(default_type) if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(convert_dtype(dtype)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 53bb9a88075..fbd6197c1b9 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2737,9 +2737,10 @@ def moveaxis(x, source, destination, name=None): out, _ = _C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'moveaxis') + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'moveaxis') helper = LayerHelper('moveaxis', **locals()) out = helper.create_variable_for_type_inference(x.dtype) -- GitLab From 72e462cd0115b41b9a855c3edb9ee0622b241527 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 1 Mar 2022 20:40:19 +0800 Subject: [PATCH 030/272] [ROCM] fix to get rocm number in script, test=develop (#39938) --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4d7451f4352..8528ba34e21 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1266,7 +1266,7 @@ function card_test() { elif [ "${WITH_ASCEND_CL}" == "ON" ];then CUDA_DEVICE_COUNT=1 elif [ "${WITH_ROCM}" == "ON" ];then - CUDA_DEVICE_COUNT=4 + CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi -- GitLab From 852a872f6dafb3f8f32b30567d8402651f8e9e1e Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 1 Mar 2022 21:00:59 +0800 Subject: [PATCH 031/272] Added attr & tensor type mapping for final state codegen (#39997) --- .../final_state_generator/eager_gen.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index c6e56e34627..02183e2ca5c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -24,6 +24,17 @@ core_ops_args_info = {} core_ops_args_type_info = {} +yaml_types_mapping = { + 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ + 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'Tensor' : 'Tensor', + 'Tensor[]' : 'std::vector', + 'Tensor[Tensor[]]' : 'std::vector>' +} + + def ParseArguments(): parser = argparse.ArgumentParser( description='Eager Code Generator Args Parser') @@ -59,7 +70,9 @@ def IsPlainTensorType(string): def IsVectorTensorType(string): - vector_tensor_types = ['list(Tensor)'] + vector_tensor_types = [ + 'std::vector>', 'std::vector' + ] if string in vector_tensor_types: return True return False @@ -180,6 +193,9 @@ def ParseYamlArgs(string): arg_name = m.group(3).split("=")[0].strip() default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None + + assert arg_type in yaml_types_mapping.keys() + arg_type = yaml_types_mapping[arg_type] if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ 
-219,6 +235,10 @@ def ParseYamlReturnsWithName(string): m = re.search(pattern, ret) ret_type = m.group(1) ret_name = m.group(2) + + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] + assert "Tensor" in ret_type returns_list.append([ret_name, ret_type, i]) -- GitLab From acdf0663ae98fee60ea61ef25bb3e8af7d88f6b4 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Wed, 2 Mar 2022 09:42:20 +0800 Subject: [PATCH 032/272] update pd_2_trt lower pass (#40019) * update pd_2_trt lower pass * update pd_2_trt lower pass * update style * udpate * change trt.graph to trt.create_engine * update comments * update comments * add test --- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 20 +++++++++--------- .../dialect/tensorrt/trt_graph_fuse_pass.h | 21 ++++++++++++------- .../dialect/tensorrt/trt_graph_split_pass.cc | 7 +++---- .../dialect/tensorrt/trt_graph_split_pass.h | 9 ++++++-- .../dialect/tensorrt/trt_op_converter_pass.cc | 12 +++++------ .../dialect/tensorrt/trt_op_converter_pass.h | 8 +++---- .../dialect/tensorrt/trt_op_teller_pass.cc | 17 +++++++-------- .../dialect/tensorrt/trt_op_teller_pass.h | 17 +++++++++------ paddle/infrt/dialect/tensorrt/trt_ops.h | 1 + paddle/infrt/dialect/tensorrt/trt_ops.td | 15 ++----------- .../{disabled_trt_ops.mlir => trt_ops.mlir} | 1 + paddle/infrt/tests/lit.cfg.py.in | 3 ++- 12 files changed, 67 insertions(+), 64 deletions(-) rename paddle/infrt/tests/dialect/{disabled_trt_ops.mlir => trt_ops.mlir} (98%) diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 17633a4e8e9..fa0095363c5 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -53,9 +53,9 @@ bool reverseDfs(std::vector source, } // merge the first&second graph op to a new graph op. -void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { +void mergeTwoAdjacentCreateEngineOp(mlir::OpBuilder &builder, // NOLINT + CreateEngineOp first, + CreateEngineOp second) { // comput inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -84,7 +84,8 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = + builder.create(loc, return_types, inputs, true); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -97,7 +98,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create<::infrt::dialect::ReturnOp>(loc, outputs); graph_op.body().push_back(block); // mapping the output @@ -149,13 +150,12 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + CreateEngineOp graph_op = ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + CreateEngineOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. 
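// The check below is the safety condition for fusing: source_nodes collects
// every producer feeding user_graph_op except graph_op itself, and
// reverseDfs() asks whether graph_op is reachable from any of them by walking
// operands backwards. If it is, user_graph_op also depends on graph_op through
// some intermediate op, and merging the two create_engine ops would break the
// topological order, so the pair is skipped; only independent adjacent engines
// are fused.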
std::vector source_nodes; @@ -168,7 +168,7 @@ void TRTGraphFusePass::runOnFunction() { // Reverse DFS from the source_nodes. if (!reverseDfs(source_nodes, [&op](const mlir::Operation *n) { return n == &op; })) { - mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); + mergeTwoAdjacentCreateEngineOp(builder, graph_op, user_graph_op); changed = true; break; } diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index ebd7a4ac4bd..350add905aa 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -26,28 +28,28 @@ namespace trt { * * func @main() -> tensor { * %a = "pd.feed"()... - * %c = "pd.graph"(%a) { + * %c = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" %m + * "Infrt.return" %m * } ... - * %d = "pd.graph"(%c) { + * %d = "trt.create_engine"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.return" %m + * "Infrt.return" %m * } ... - * %f = "pd.graph"(%a) { + * %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" %m + * "Infrt.return" %m * } ... * "pd.fetch" %d, %f * * destination func: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... * "pd.fetch" %d, %f * } @@ -55,6 +57,9 @@ namespace trt { class TRTGraphFusePass : public mlir::PassWrapper { public: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; }; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index f24b9cc40cd..5ee7b23213a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -22,18 +22,17 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + CreateEngineOp graph_op = ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + CreateEngineOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 51f84227243..28078e2bc2d 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -27,11 +29,11 @@ namespace trt { * * func @main() -> tensor { * %a = "pd.feed"()... 
- * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" (%n, %s) + * "Infrt.return" (%n, %s) * } ... * "pd.fetch" (%d, %f) * } @@ -49,6 +51,9 @@ class TRTGraphSplitPass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } void runOnFunction() override; explicit TRTGraphSplitPass(size_t min_subgraph_size = 3) : min_subgraph_size_(min_subgraph_size) {} diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index e34308a2f0f..8d81e739d9c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" -#include "mlir/IR/Builders.h" -#include "mlir/Transforms/DialectConversion.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" @@ -22,12 +22,10 @@ namespace trt { #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT -using namespace mlir; - void TRTOpConverterPass::runOnOperation() { // The first thing to define is the conversion target. This will define the // final target for this lowering. - ConversionTarget target(getContext()); + ::mlir::ConversionTarget target(getContext()); // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to TensorRTDialect from @@ -36,13 +34,13 @@ void TRTOpConverterPass::runOnOperation() { // Now that the conversion target has been defined, we just need to provide // the set of patterns that will lower the TensorRT operations. - RewritePatternSet patterns(&getContext()); + ::mlir::RewritePatternSet patterns(&getContext()); populateWithGenerated(patterns); // With the target and rewrite patterns defined, we can now attempt the // conversion. The conversion will signal failure if any of our `illegal` // operations were not converted successfully. - if (failed( + if (::mlir::failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); } diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 0adbf11b891..a8128a585ee 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -25,11 +25,11 @@ namespace trt { * source ir: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... * "pd.fetch" %d, %f * } @@ -37,11 +37,11 @@ namespace trt { * destination ir: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "trt.Convolution"(%a)... * %n = "trt.Convolution"(%m)... * %s = "trt.Convolution"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 176fdb7a2e0..17e893a383a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include +#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { @@ -33,16 +34,14 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null(op); - if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null(op); - if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null(op); - if (op3) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( - loc, op->getResultTypes(), op->getOperands()); + auto graph_op = builder.create( + loc, op->getResultTypes(), op->getOperands(), true); ::llvm::SmallVector tblgen_repl_values; for (auto v : @@ -55,7 +54,7 @@ void TRTOpTellerPass::runOnFunction() { graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create<::infrt::dialect::ReturnOp>(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index 8b9a16376ce..471eafa9f9b 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -35,17 +37,17 @@ namespace trt { * destination func: * func @main() -> tensor { * %a = "pd.feed"()... - * %c = "pd.graph"(%a) { + * %c = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... - * %d = "pd.graph"(%c) { + * %d = "trt.create_engine"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... - * %f = "pd.graph"(%a) { + * %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... 
* "pd.fetch" (%d, %f) * } @@ -55,6 +57,9 @@ namespace trt { class TRTOpTellerPass : public mlir::PassWrapper { public: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; }; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index a37491ec1ab..95b2ed41fdf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,6 +28,7 @@ #include #include #include +#include "paddle/infrt/dialect/basic_kernels.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 8e3dfffff54..31142a5157b 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,25 +7,14 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" -def TRT_FetchOp : TRT_Op<"fetch", [Terminator]> { - let summary = "TensorRT engine return operation"; - let description = [{ - The `trt.fetch` operation terminates and returns values for the - `trt.graph` operation. - }]; - - let arguments = (ins Variadic:$inputs); -} - -def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::dialect::ReturnOp">]> { let summary = "trt Graph Op"; let description = [{ Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); let results = (outs Variadic:$outputs); - } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir similarity index 98% rename from paddle/infrt/tests/dialect/disabled_trt_ops.mlir rename to paddle/infrt/tests/dialect/trt_ops.mlir index b59cfb04816..49510bc542d 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,3 +1,4 @@ +// RUN: trt-exec %s // CHECK-LABEL: @main func @main() -> tensor { %bias = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/lit.cfg.py.in b/paddle/infrt/tests/lit.cfg.py.in index 19ee0076b55..d47957dac92 100644 --- a/paddle/infrt/tests/lit.cfg.py.in +++ b/paddle/infrt/tests/lit.cfg.py.in @@ -21,10 +21,11 @@ build_dir = "@CMAKE_BINARY_DIR@" config.llvm_tools_dir = os.path.join(build_dir, "third_party/install/llvm/bin") config.llvm_tools_dir = os.path.join(build_dir, "/third_party/install/llvm/lib") infrtopt_bin = os.path.join(build_dir, "paddle/infrt/dialect/") +trtexec_bin = os.path.join(build_dir, "paddle/infrt/dialect/tensorrt/") infrtexec_bin = os.path.join(build_dir, "paddle/infrt/host_context/") llvm_bin = os.path.join(build_dir, "third_party/install/llvm/bin/") config.environment['PATH'] = os.path.pathsep.join( - (infrtopt_bin, infrtexec_bin, llvm_bin, config.environment['PATH'])) + (infrtopt_bin, infrtexec_bin, trtexec_bin, llvm_bin, config.environment['PATH'])) config.suffixes = ['.mlir'] -- GitLab From fb0cadfd2fa159a3d949357300a668a9cff75802 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 10:05:45 +0800 Subject: [PATCH 033/272] Fix bug for prepare phi OP (#40033) --- paddle/fluid/imperative/prepared_operator.cc | 7 +++---- 1 
file changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9dd1dacc02c..2317bfdd7c0 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -186,11 +186,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; - if (platform::is_cpu_place(expected_kernel_key.place_)) { - auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, cpu_ctx); + if (expected_kernel_key.place_ != place) { + dev_ctx = pool.Get(expected_kernel_key.place_); } + // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); -- GitLab From dbcf879758db039d68b5c6018b9229f4548e8702 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 2 Mar 2022 10:17:29 +0800 Subject: [PATCH 034/272] [Eager] Support gnn ptb_rnn in eager mode (#39993) --- .../paddle/fluid/tests/unittests/test_imperative_gnn.py | 8 +++++++- .../unittests/test_imperative_ptb_rnn_sorted_gradient.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index c813aeede6f..a5a90461551 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -23,6 +23,7 @@ import paddle.fluid.core as core from paddle.fluid.optimizer import AdamOptimizer from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.framework import _test_eager_guard def gen_data(): @@ -60,7 +61,7 @@ class GCN(fluid.Layer): class TestDygraphGNN(unittest.TestCase): - def test_gnn_float32(self): + def func_gnn_float32(self): paddle.seed(90) paddle.framework.random._manual_program_seed(90) startup = fluid.Program() @@ -168,6 +169,11 @@ class TestDygraphGNN(unittest.TestCase): self.assertTrue(np.allclose(static_weight, model2_gc_weight_value)) sys.stderr.write('%s %s\n' % (static_loss, loss_value)) + def test_gnn_float32(self): + with _test_eager_guard(): + self.func_gnn_float32() + self.func_gnn_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index e5453eed136..f659d834354 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -26,10 +26,11 @@ from test_imperative_base import new_program_scope from test_imperative_ptb_rnn import PtbModel import numpy as np import six +from paddle.fluid.framework import _test_eager_guard class TestDygraphPtbRnnSortGradient(unittest.TestCase): - def test_ptb_rnn_sort_gradient(self): + def func_ptb_rnn_sort_gradient(self): for is_sparse in [True, False]: self.ptb_rnn_sort_gradient_cpu_float32(is_sparse) @@ -171,6 +172,11 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): for key, value in six.iteritems(static_param_updated): self.assertTrue(np.array_equal(value, dy_param_updated[key])) + def test_ptb_rnn_sort_gradient(self): + with _test_eager_guard(): + self.func_ptb_rnn_sort_gradient() + 
self.func_ptb_rnn_sort_gradient() + if __name__ == '__main__': unittest.main() -- GitLab From e4dba69a2fdc793ca399042e688256108e0098fb Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Wed, 2 Mar 2022 10:23:15 +0800 Subject: [PATCH 035/272] [Pten] Gru lstm migration (#39729) * move sequence2batch * move lstm and gru * Add phi/kernels directory into exclusion to stop using hipcc to compile non .cu files in it. --- cmake/generic.cmake | 4 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 6 +- .../fluid/operators/fused/fusion_lstm_op.cc | 6 +- paddle/fluid/operators/fused/multi_gru_op.cc | 2 +- paddle/fluid/operators/gru_op.cc | 28 +- paddle/fluid/operators/gru_op.cu.cc | 12 +- paddle/fluid/operators/gru_op.h | 22 +- paddle/fluid/operators/lstm_op.h | 38 +- paddle/fluid/operators/lstmp_op.h | 68 +-- paddle/fluid/operators/math/CMakeLists.txt | 6 +- paddle/fluid/operators/math/gru_compute.h | 80 ---- paddle/fluid/operators/math/lstm_compute.cc | 93 ---- paddle/fluid/operators/math/lstm_compute.cu | 59 --- paddle/fluid/operators/rnn_op.h | 64 +-- paddle/phi/kernels/funcs/CMakeLists.txt | 4 + .../kernels/funcs}/detail/CMakeLists.txt | 0 .../funcs}/detail/activation_functions.h | 68 +-- .../kernels/funcs}/detail/avx_functions.cc | 19 +- .../kernels/funcs}/detail/avx_mathfun.h | 6 +- .../kernels/funcs}/detail/gru_cpu_kernel.h | 451 ++++++++++++------ .../kernels/funcs}/detail/gru_gpu_kernel.h | 106 ++-- .../kernels/funcs}/detail/gru_kernel.h | 150 +++--- .../kernels/funcs}/detail/lstm_cpu_kernel.h | 266 ++++++++--- .../kernels/funcs}/detail/lstm_gpu_kernel.h | 159 ++++-- .../kernels/funcs}/detail/lstm_kernel.h | 123 +++-- paddle/phi/kernels/funcs/gru_compute.cc | 373 +++++++++++++++ paddle/phi/kernels/funcs/gru_compute.cu | 349 ++++++++++++++ paddle/phi/kernels/funcs/gru_compute.h | 88 ++++ paddle/phi/kernels/funcs/lstm_compute.cc | 103 ++++ paddle/phi/kernels/funcs/lstm_compute.cu | 76 +++ .../math => phi/kernels/funcs}/lstm_compute.h | 39 +- .../kernels/funcs}/sequence2batch.cc | 62 +-- .../kernels/funcs}/sequence2batch.cu | 72 +-- .../kernels/funcs}/sequence2batch.h | 66 +-- 35 files changed, 2181 insertions(+), 893 deletions(-) delete mode 100644 paddle/fluid/operators/math/gru_compute.h delete mode 100644 paddle/fluid/operators/math/lstm_compute.cc delete mode 100644 paddle/fluid/operators/math/lstm_compute.cu rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/CMakeLists.txt (100%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/activation_functions.h (75%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/avx_functions.cc (87%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/avx_mathfun.h (99%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_cpu_kernel.h (60%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_gpu_kernel.h (74%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_kernel.h (64%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_cpu_kernel.h (65%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_gpu_kernel.h (68%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_kernel.h (59%) create mode 100644 paddle/phi/kernels/funcs/gru_compute.cc create mode 100644 paddle/phi/kernels/funcs/gru_compute.cu create mode 100644 paddle/phi/kernels/funcs/gru_compute.h create mode 100644 paddle/phi/kernels/funcs/lstm_compute.cc create mode 100644 
paddle/phi/kernels/funcs/lstm_compute.cu rename paddle/{fluid/operators/math => phi/kernels/funcs}/lstm_compute.h (56%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.cc (56%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.cu (55%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.h (80%) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 51ed537ce5d..da81575188f 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -580,8 +580,8 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators") - set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() if (hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 56c2c86e1a7..0c83c36b475 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -473,7 +473,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); @@ -591,7 +591,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { #undef MOVE_ONE_BATCH #undef DEFINE_CUR - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 41a69031c54..3311e3b4ebc 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -368,7 +368,7 @@ class FusionGRUKernel : public framework::OpKernel { hidden_out->mutable_data(place); auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; math::FCFunctor fc; if (M > D3) { @@ -463,7 +463,7 @@ class FusionGRUKernel : public framework::OpKernel { batched_input_data = cur_batched_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_out, hidden_out); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 06d406867f0..00be8b09d12 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -421,7 +421,7 @@ class FuisonLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); math::FCFunctor fc; @@ -514,7 +514,7 @@ class FuisonLSTMKernel : public framework::OpKernel { batched_input_data = cur_in_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 84826ff3993..c2260c53b2e 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 88530b5352d..d7cf03ddd61 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_op.h" #include #include -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" DECLARE_int32(paddle_num_threads); @@ -316,7 +316,7 @@ class GRUCPUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -326,7 +326,7 @@ class GRUCPUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -347,9 +347,9 @@ class GRUCPUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); #ifdef PADDLE_WITH_MKLML @@ -396,9 +396,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); } - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); + phi::funcs::detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), gru_value, + frame_size, cur_batch_size, active_gate); if (gru_value.prev_out_value) { blas.GEMM_COMPUTE( @@ -408,9 +408,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 3); } - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node, origin_mode); + phi::funcs::detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), gru_value, + frame_size, cur_batch_size, active_node, origin_mode); gru_value.prev_out_value = gru_value.output_value; } @@ -432,7 +432,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); @@ -441,7 +441,7 @@ class GRUCPUKernel : public framework::OpKernel { #ifdef PADDLE_WITH_MKLML } #endif - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 7d055240916..5be0acc1543 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -65,7 +65,7 @@ class GRUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); 
to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -96,9 +96,9 @@ class GRUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -111,13 +111,13 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 130b10c7390..852655034c8 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -63,7 +63,7 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); @@ -93,12 +93,12 @@ class GRUGradKernel : public framework::OpKernel { batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse); - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaGrad gru_grad; if (weight_grad) { 
gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); @@ -112,9 +112,9 @@ class GRUGradKernel : public framework::OpKernel { auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); @@ -145,13 +145,13 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; - math::GRUUnitGradFunctor::compute( + phi::funcs::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, active_gate, origin_mode); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, input_grad); } diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 62f9cd26c41..4ec3072a96d 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -80,7 +80,7 @@ class LSTMKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
@@ -121,11 +121,11 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto blas = phi::funcs::GetBlas(device_ctx); @@ -166,13 +166,13 @@ class LSTMKernel : public framework::OpKernel { lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, hidden_out); @@ -241,7 +241,7 @@ class LSTMGradKernel : public framework::OpKernel { ") should be %d, but received %d in LSTM@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstm_value.check_ig = bias_data + 4 * frame_size; @@ -253,7 +253,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaGrad lstm_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -270,7 +270,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -293,11 +293,11 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; @@ -338,7 +338,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -369,7 +369,7 @@ class LSTMGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 96c074f1efb..5d24c0b70d3 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ 
b/paddle/fluid/operators/lstmp_op.h @@ -18,12 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, dst, indexed_src); } @@ -81,15 +81,15 @@ template class LSTMPKernel : public framework::OpKernel { public: template - void ActCompute(const math::detail::ActivationType act_type, const Device& d, - X x, Y y, platform::Place place) const { - if (act_type == math::detail::ActivationType::kIdentity) { + void ActCompute(const phi::funcs::detail::ActivationType act_type, + const Device& d, X x, Y y, platform::Place place) const { + if (act_type == phi::funcs::detail::ActivationType::kIdentity) { y.device(d) = x; - } else if (act_type == math::detail::ActivationType::kSigmoid) { + } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { SigmoidFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kTanh) { + } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { TanhFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kReLU) { + } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { if (place == platform::CPUPlace()) ReluCPUFunctor()(d, x, y); else @@ -120,7 +120,7 @@ class LSTMPKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -137,7 +137,7 @@ class LSTMPKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmpMetaValue will be updated later. 
@@ -176,13 +176,13 @@ class LSTMPKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); auto blas = phi::funcs::GetBlas(device_ctx); @@ -222,13 +222,13 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.output_value = hidden_t.data(); lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); } @@ -242,7 +242,7 @@ class LSTMPKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_proj.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_proj, proj_out); @@ -257,16 +257,16 @@ template class LSTMPGradKernel : public framework::OpKernel { public: template - void ActGradCompute(const math::detail::ActivationType act_type, + void ActGradCompute(const phi::funcs::detail::ActivationType act_type, const Device& d, X x, Y y, DX dx, DY dy) const { // x is dummy and won't be used even in Relu(use y instead) - if (act_type == math::detail::ActivationType::kIdentity) + if (act_type == phi::funcs::detail::ActivationType::kIdentity) dx.device(d) = dy; - else if (act_type == math::detail::ActivationType::kSigmoid) + else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kTanh) + else if (act_type == phi::funcs::detail::ActivationType::kTanh) TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kReLU) + else if (act_type == phi::funcs::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else PADDLE_THROW( @@ -340,7 +340,7 @@ class LSTMPGradKernel : public framework::OpKernel { "but received %d in LSTMP@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstmp_value.check_ig = bias_data + 4 * frame_size; @@ -352,7 +352,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } - math::LstmMetaGrad lstmp_grad; + phi::funcs::LstmMetaGrad lstmp_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -369,7 +369,7 @@ class LSTMPGradKernel : public 
framework::OpKernel { lstmp_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -393,13 +393,13 @@ class LSTMPGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); @@ -423,7 +423,7 @@ class LSTMPGradKernel : public framework::OpKernel { _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); } - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, @@ -470,7 +470,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.output_value = nullptr; lstmp_grad.state_active_grad = nullptr; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -503,7 +503,7 @@ class LSTMPGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ac6566a8703..ba047355ad7 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(detail) - if (WITH_ASCEND_CL) cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner) endif() @@ -18,8 +16,7 @@ math_library(im2col) math_library(sample_prob) math_library(sampler DEPS generator) -math_library(gru_compute DEPS activation_functions math_function) -math_library(lstm_compute DEPS activation_functions) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -29,7 +26,6 @@ else() math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) endif() -math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h deleted file mode 100644 index 70cbfecefc8..00000000000 --- a/paddle/fluid/operators/math/gru_compute.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct GRUMetaValue { - const T *gate_weight; - const T *state_weight; - const T *reset_bias; - T *gate_value; - T *reset_output_value; - T *output_value; - const T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; - T *bias_hh_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -template -struct GRUUnitGradFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc deleted file mode 100644 index aa4fe65a520..00000000000 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/lstm_compute.h" - -#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, detail::backward::lstm(), value, - grad, frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu deleted file mode 100644 index 4342cb7b799..00000000000 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" -#include "paddle/fluid/operators/math/lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index c18570af775..b636184ae45 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -100,7 +100,7 @@ struct Cell { }; template class EigenActivationFunctor, - math::detail::ActivationType act_type> + phi::funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell { void operator()(const platform::CPUDeviceContext* device_ctx, Tensor* input, const Tensor* weight_hh, const Tensor* init_h, @@ -148,7 +148,7 @@ struct GRUCell : Cell { size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = weight_hh->data(); gru_value.state_weight = weight_hh->data() + 2 * frame_size * frame_size; gru_value.reset_bias = bias_hh->data() + 2 * frame_size; @@ -158,10 +158,10 @@ struct GRUCell : Cell { gru_value.output_value = output->data(); gru_value.prev_out_value = init_h->data(); - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); - 
math::GRUUnitFunctorV2::compute( + phi::funcs::GRUUnitFunctorV2::compute( *device_ctx, gru_value, frame_size, batch_size, cand_act, gate_act); } }; @@ -184,14 +184,14 @@ struct LSTMCell : Cell { blas.MatMul(*init_h, mat_dim_a, *weight_hh, mat_dim_b, static_cast(1.0), input, static_cast(1.0)); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; lstm_value.check_ig = nullptr; lstm_value.check_fg = nullptr; lstm_value.check_og = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cell_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cell_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; @@ -208,7 +208,7 @@ struct LSTMCell : Cell { lstm_value.state_value = last_c->data(); lstm_value.state_active_value = last_c_act->data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( *device_ctx, lstm_value, frame_size, batch_size, cell_clip, gate_act, cell_act, cand_act, false); } @@ -986,18 +986,18 @@ class RNNCPUKernel : public framework::OpKernel { seed, reserve_data); } else if (is_rnn_relu(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, seed, reserve_data); } else if (is_rnn_tanh(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, @@ -1014,14 +1014,14 @@ class RNNCPUKernel : public framework::OpKernel { }; template -void create_lstm_value(math::LstmMetaValue* lstm_value) { +void create_lstm_value(phi::funcs::LstmMetaValue* lstm_value) { lstm_value->check_ig = nullptr; lstm_value->check_fg = nullptr; lstm_value->check_og = nullptr; } template -void create_lstm_grad(math::LstmMetaGrad* lstm_grad) { +void create_lstm_grad(phi::funcs::LstmMetaGrad* lstm_grad) { lstm_grad->check_ig_grad = nullptr; lstm_grad->check_fg_grad = nullptr; lstm_grad->check_og_grad = nullptr; @@ -1686,8 +1686,8 @@ struct GRUGradCell : GradCell { // zero pre_hidden phi::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); - math::GRUMetaValue gru_value; - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaValue gru_value; + phi::funcs::GRUMetaGrad gru_grad; gru_value.gate_value = gate_tensor->data(); gru_value.prev_out_value = pre_hidden->data(); gru_value.reset_output_value = state_tensor->data(); @@ -1703,9 +1703,9 @@ struct GRUGradCell : GradCell { grad_weight_hh->data() + 2 * frame_size * frame_size; gru_grad.bias_hh_grad = grad_bias_hh->data(); - auto act_gate = math::detail::GetActivationType("sigmoid_v2"); - auto act_node = math::detail::GetActivationType("tanh_v2"); - math::GRUUnitGradFunctorV2::compute( + auto act_gate = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto act_node = phi::funcs::detail::GetActivationType("tanh_v2"); + 
phi::funcs::GRUUnitGradFunctorV2::compute( device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node, act_gate); @@ -1738,8 +1738,8 @@ struct LSTMGradCell : GradCell { backup_tensor(context, &grad_pre_state_bak, grad_pre_state); } - math::LstmMetaValue lstm_value; - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaValue lstm_value; + phi::funcs::LstmMetaGrad lstm_grad; create_lstm_value(&lstm_value); create_lstm_grad(&lstm_grad); lstm_value.gate_value = gate_tensor->data(); @@ -1755,12 +1755,12 @@ struct LSTMGradCell : GradCell { lstm_value.output_value = nullptr; lstm_grad.state_active_grad = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto state_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto state_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip, gate_act, state_act, cand_act, false); this->update_pre_hidden_grad( diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index aa4fac16920..8b8697b6df1 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,6 +1,10 @@ add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) +add_subdirectory(detail) math_library(math_function DEPS blas dense_tensor tensor) +math_library(sequence2batch) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) math_library(concat_and_split_functor DEPS dense_tensor) diff --git a/paddle/fluid/operators/math/detail/CMakeLists.txt b/paddle/phi/kernels/funcs/detail/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/math/detail/CMakeLists.txt rename to paddle/phi/kernels/funcs/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h similarity index 75% rename from paddle/fluid/operators/math/detail/activation_functions.h rename to paddle/phi/kernels/funcs/detail/activation_functions.h index 1fac60e7cb8..475557f1642 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/phi/kernels/funcs/detail/activation_functions.h @@ -19,9 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/core/hostdevice.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { #define SIGMOID_THRESHOLD_MIN -40.0 @@ -132,25 +131,35 @@ struct Active { #ifdef PADDLE_WITH_CUDA -static DEVICE Active::Act kActFloat[] = { - &forward::Sigmoid, &forward::SigmoidV2, - &forward::Relu, &forward::Tanh, - &forward::TanhV2, &forward::Identity}; +static DEVICE Active::Act kActFloat[] = {&forward::Sigmoid, + &forward::SigmoidV2, + &forward::Relu, + &forward::Tanh, + &forward::TanhV2, + &forward::Identity}; static DEVICE Active::ActGrad kActGradFloat[] = { - &backward::Sigmoid, &backward::Sigmoid, - &backward::Relu, &backward::Tanh, - &backward::Tanh, &backward::Identity}; - -static DEVICE Active::Act kActDouble[] = { - &forward::Sigmoid, &forward::SigmoidV2, - &forward::Relu, &forward::Tanh, - &forward::TanhV2, &forward::Identity}; + &backward::Sigmoid, + &backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Tanh, + &backward::Identity}; + +static DEVICE Active::Act kActDouble[] = {&forward::Sigmoid, + &forward::SigmoidV2, + &forward::Relu, + &forward::Tanh, + &forward::TanhV2, + &forward::Identity}; static DEVICE Active::ActGrad kActGradDouble[] = { - &backward::Sigmoid, &backward::Sigmoid, - &backward::Relu, &backward::Tanh, - &backward::Tanh, &backward::Identity}; + &backward::Sigmoid, + &backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Tanh, + &backward::Identity}; namespace forward { inline DEVICE float activation(float a, int index) { @@ -287,13 +296,19 @@ __m256 Identity(const __m256 a, const __m256 b); } // namespace avx } // namespace backward -static Active<__m256>::Act kActAvx[] = { - &forward::avx::Sigmoid, &forward::avx::SigmoidV2, &forward::avx::Relu, - &forward::avx::Tanh, &forward::avx::TanhV2, &forward::avx::Identity}; +static Active<__m256>::Act kActAvx[] = {&forward::avx::Sigmoid, + &forward::avx::SigmoidV2, + &forward::avx::Relu, + &forward::avx::Tanh, + &forward::avx::TanhV2, + &forward::avx::Identity}; -static Active<__m256>::ActGrad kActGradAvx[] = { - &backward::avx::Sigmoid, &backward::avx::Sigmoid, &backward::avx::Relu, - &backward::avx::Tanh, &backward::avx::Tanh, &backward::avx::Identity}; +static Active<__m256>::ActGrad kActGradAvx[] = {&backward::avx::Sigmoid, + &backward::avx::Sigmoid, + &backward::avx::Relu, + &backward::avx::Tanh, + &backward::avx::Tanh, + &backward::avx::Identity}; namespace forward { inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } @@ -308,6 +323,5 @@ inline __m256 activation(__m256 a, __m256 b, int index) { #endif } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/phi/kernels/funcs/detail/avx_functions.cc similarity index 87% rename from paddle/fluid/operators/math/detail/avx_functions.cc rename to paddle/phi/kernels/funcs/detail/avx_functions.cc index 89e2c825c24..51af97857df 100644 --- a/paddle/fluid/operators/math/detail/avx_functions.cc +++ b/paddle/phi/kernels/funcs/detail/avx_functions.cc @@ -14,12 +14,11 @@ limitations under the License. 
*/ #ifdef __AVX__ -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/detail/avx_mathfun.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/detail/avx_mathfun.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { __m256 Exp(__m256 a) { return exp256_ps(a); } @@ -77,8 +76,9 @@ namespace backward { namespace avx { __m256 Relu(const __m256 a, const __m256 b) { return _mm256_mul_ps( - a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); + a, + _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); } __m256 Sigmoid(const __m256 a, const __m256 b) { @@ -96,8 +96,7 @@ __m256 Identity(const __m256 a, const __m256 b) { return a; } } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi #endif diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h similarity index 99% rename from paddle/fluid/operators/math/detail/avx_mathfun.h rename to paddle/phi/kernels/funcs/detail/avx_mathfun.h index d7cf91134e4..e5e7388d51d 100644 --- a/paddle/fluid/operators/math/detail/avx_mathfun.h +++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h @@ -49,9 +49,9 @@ typedef __m256 v8sf; // vector of 8 float (avx) typedef __m256i v8si; // vector of 8 int (avx) typedef __m128i v4si; // vector of 8 int (avx) -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \ - Val, Val} +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ + Val, Val, Val, Val} _PI32AVX_CONST(1, 1); _PI32AVX_CONST(inv1, ~1); diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h similarity index 60% rename from paddle/fluid/operators/math/detail/gru_cpu_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index cbbfbc321b5..cb37daa680e 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -16,24 +16,28 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { using Array1 = Eigen::DSizes; -template -using EigenVector = framework::EigenVector; +using EigenVector = paddle::framework::EigenVector; #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group for GRU CPU template -void hl_naive_gru_forward_reset_output( - OpResetOutput op_reset_output, T *gate_value, T *reset_output_value, - const T *prev_output_value, int frame_size, ActivationType active_gate, - bool old_version = true, const T *reset_bias = nullptr) { +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, + T *reset_output_value, + const T *prev_output_value, + int frame_size, + ActivationType active_gate, + bool old_version = true, + const T *reset_bias = nullptr) { T r_value_update_gate; T r_value_reset_gate; T r_value_reset_output; @@ -59,8 +63,12 @@ void hl_naive_gru_forward_reset_output( r_prev_out = prev_output_value[i]; } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate, &r_reset_bias, + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate, + &r_reset_bias, old_version); update_gate[i] = r_value_update_gate; @@ -70,10 +78,14 @@ void hl_naive_gru_forward_reset_output( } template -void hl_naive_gru_forward_final_output( - OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, ActivationType active_node, - bool origin_mode, bool old_version = true) { +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, + ActivationType active_node, + bool origin_mode, + bool old_version = true) { T r_value_update_gate; T r_value_frame_state; T r_prev_out = 0; @@ -93,8 +105,12 @@ void hl_naive_gru_forward_final_output( r_prev_out = prev_output_value[i]; } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); frame_state[i] = r_value_frame_state; output_value[i] = r_output; @@ -103,8 +119,10 @@ void hl_naive_gru_forward_final_output( template void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, T *reset_output_value, - const T *prev_output_value, int frame_size, + T *gate_value, + T *reset_output_value, + const T *prev_output_value, + int frame_size, ActivationType active_gate, bool old_version = true, const T *reset_bias = nullptr) { @@ -152,8 +170,12 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, _mm256_loadu_ps((const float *)(reset_output_value + i)); } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate, &r_reset_bias, + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate, + &r_reset_bias, old_version); _mm256_storeu_ps(reinterpret_cast(update_gate + i), @@ -167,9 +189,13 @@ 
void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, if (rest > 0) { i = n - block; - op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last, - &r_prev_out_last, &r_value_reset_output, active_gate, - &r_reset_bias, old_version); + op_reset_output(&r_value_update_gate_last, + &r_value_reset_gate_last, + &r_prev_out_last, + &r_value_reset_output, + active_gate, + &r_reset_bias, + old_version); _mm256_storeu_ps(reinterpret_cast(update_gate + i), r_value_update_gate_last); @@ -183,8 +209,10 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, template void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, ActivationType active_node, bool origin_mode, bool old_version = true) { @@ -226,8 +254,12 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); _mm256_storeu_ps(reinterpret_cast(frame_state + i), r_value_frame_state); @@ -236,8 +268,12 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, if (rest > 0) { i = n - block; - op_final_output(&r_value_update_gate_last, &r_value_frame_state_last, - &r_prev_out_last, &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate_last, + &r_value_frame_state_last, + &r_prev_out_last, + &r_output, + active_node, + origin_mode); _mm256_storeu_ps(reinterpret_cast(frame_state + i), r_value_frame_state_last); @@ -248,8 +284,10 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, } template -inline void forward_reset_outputV2(const platform::CPUDeviceContext &context, - GRUMetaValue value, int frame_size) { +inline void forward_reset_outputV2( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + int frame_size) { auto &place = *context.eigen_device(); auto value_reset_gate = typename EigenVector::Type(value.gate_value, Array1(frame_size)); @@ -259,17 +297,23 @@ inline void forward_reset_outputV2(const platform::CPUDeviceContext &context, value.reset_output_value, Array1(frame_size)); auto value_reset_bias = typename EigenVector::ConstType(value.reset_bias, Array1(frame_size)); - SigmoidFunctor()(place, value_reset_gate, value_reset_gate); - SigmoidFunctor()(place, value_update_gate, value_update_gate); + paddle::operators::SigmoidFunctor()( + place, value_reset_gate, value_reset_gate); + paddle::operators::SigmoidFunctor()( + place, value_update_gate, value_update_gate); value_reset_output.device(place) = (value_reset_output + value_reset_bias) * value_reset_gate; } template inline void forward_reset_output( - OpResetOutput op_reset_output, GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_gate, bool old_version = true, - const platform::CPUDeviceContext *context = nullptr) { + OpResetOutput op_reset_output, + phi::funcs::GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_gate, + bool old_version = true, + const paddle::platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { if (!old_version) { // use eigen @@ -277,15 +321,23 @@ inline void 
forward_reset_output( } else { if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + hl_avx_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate, + old_version, + value.reset_bias); } else { - hl_naive_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + hl_naive_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate, + old_version, + value.reset_bias); } } value.gate_value += frame_size * 3; @@ -297,8 +349,10 @@ inline void forward_reset_output( } template -inline void forward_final_outputV2(const platform::CPUDeviceContext &context, - GRUMetaValue value, int frame_size) { +inline void forward_final_outputV2( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + int frame_size) { auto &place = *context.eigen_device(); auto value_update_gate = typename EigenVector::Type( value.gate_value + frame_size, Array1(frame_size)); @@ -306,7 +360,8 @@ inline void forward_final_outputV2(const platform::CPUDeviceContext &context, value.gate_value + 2 * frame_size, Array1(frame_size)); auto value_output = typename EigenVector::Type(value.output_value, Array1(frame_size)); - TanhFunctor()(place, value_frame_state, value_frame_state); + paddle::operators::TanhFunctor()( + place, value_frame_state, value_frame_state); value_output.device(place) = (static_cast(1.0) - value_update_gate) * value_frame_state; if (value.prev_out_value) { @@ -319,10 +374,14 @@ inline void forward_final_outputV2(const platform::CPUDeviceContext &context, template inline void forward_final_output( - OpFinalOutput op_final_output, GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_node, bool origin_mode, + OpFinalOutput op_final_output, + phi::funcs::GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode, bool old_version = true, - const platform::CPUDeviceContext *context = nullptr) { + const paddle::platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { if (!old_version) { // eigen @@ -330,15 +389,23 @@ inline void forward_final_output( } else { if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + hl_avx_gru_forward_final_output(op_final_output, + value.gate_value, value.prev_out_value, - value.output_value, frame_size, - active_node, origin_mode, old_version); + value.output_value, + frame_size, + active_node, + origin_mode, + old_version); } else { - hl_naive_gru_forward_final_output( - op_final_output, value.gate_value, value.prev_out_value, - value.output_value, frame_size, active_node, origin_mode, - old_version); + hl_naive_gru_forward_final_output(op_final_output, + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + active_node, + origin_mode, + old_version); } } value.gate_value += frame_size * 3; @@ -350,9 +417,12 @@ inline void forward_final_output( } template -void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, - T 
*gate_grad, const T *prev_out_value, - T *prev_out_grad, T *output_grad, +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, int frame_size, ActivationType active_node, bool origin_mode) { @@ -379,9 +449,15 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, - &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, - &r_prev_out_grad, &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -392,9 +468,12 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, } template -void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, int frame_size, ActivationType active_gate) { T r_update_gate_value; @@ -424,9 +503,14 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, - &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, - &r_prev_out_grad, &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; @@ -437,10 +521,14 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, } template -void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *output_grad, - int frame_size, ActivationType active_node, +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + ActivationType active_node, bool origin_mode) { #ifdef __AVX__ __m256 r_update_gate_value; @@ -468,9 +556,15 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, - &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, - &r_prev_out_grad, &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -482,9 +576,12 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, } template -void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, 
+ T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, int frame_size, ActivationType active_gate) { #ifdef __AVX__ @@ -516,9 +613,14 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, - &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, - &r_prev_out_grad, &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; @@ -530,11 +632,16 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, } template -inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_value, - T *reset_output_grad, T *output_grad, - int frame_size, ActivationType active_node, +inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_value, + T *reset_output_grad, + T *output_grad, + int frame_size, + ActivationType active_node, ActivationType active_gate) { T r_value_reset_gate; T r_grad_reset_gate; @@ -573,10 +680,18 @@ inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, r_grad_reset_output = reset_output_grad[i]; } - op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate, - &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state, - &r_value_prev_out, &r_grad_prev_out, &r_grad_output, - &r_value_reset_output, &r_grad_reset_output, active_node, + op_gru_grad(&r_value_reset_gate, + &r_grad_reset_gate, + &r_value_update_gate, + &r_grad_update_gate, + &r_value_frame_state, + &r_grad_frame_state, + &r_value_prev_out, + &r_grad_prev_out, + &r_grad_output, + &r_value_reset_output, + &r_grad_reset_output, + active_node, active_gate); reset_gate_grad[i] = r_grad_reset_gate; @@ -592,11 +707,16 @@ inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, } template -inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_value, - T *reset_output_grad, T *output_grad, - int frame_size, ActivationType active_node, +inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_value, + T *reset_output_grad, + T *output_grad, + int frame_size, + ActivationType active_node, ActivationType active_gate) { #ifdef __AVX__ __m256 r_value_reset_gate; @@ -639,10 +759,18 @@ inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, r_grad_reset_output = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; } - op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate, - &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state, - &r_value_prev_out, &r_grad_prev_out, &r_grad_output, - &r_value_reset_output, &r_grad_reset_output, active_node, + op_gru_grad(&r_value_reset_gate, + &r_grad_reset_gate, + &r_value_update_gate, + &r_grad_update_gate, + &r_value_frame_state, + &r_grad_frame_state, + &r_value_prev_out, + &r_grad_prev_out, + &r_grad_output, + &r_value_reset_output, + &r_grad_reset_output, + active_node, 
active_gate); reset_gate_grad[i] = r_grad_reset_gate; @@ -660,20 +788,33 @@ inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, template inline void backward_state_grad(OpStateGrad op_state_grad, - GRUMetaValue value, GRUMetaGrad grad, - int frame_size, int batch_size, - ActivationType active_node, bool origin_mode) { + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { for (int b = 0; b < batch_size; b++) { if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value, - grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.output_grad, - frame_size, active_node, origin_mode); + hl_avx_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); } else { - hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value, - grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.output_grad, - frame_size, active_node, origin_mode); + hl_naive_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); } value.gate_value += frame_size * 3; @@ -691,18 +832,30 @@ inline void backward_state_grad(OpStateGrad op_state_grad, template inline void backward_reset_grad(OpResetGrad op_reset_grad, - GRUMetaValue value, GRUMetaGrad grad, - int frame_size, int batch_size, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, ActivationType active_gate) { for (int b = 0; b < batch_size; b++) { if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_reset_grad( - op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + hl_avx_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); } else { - hl_naive_gru_backward_reset_grad( - op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + hl_naive_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); } value.gate_value += frame_size * 3; @@ -719,8 +872,9 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad, } template -inline void gru_backward(const platform::CPUDeviceContext &context, - GRUMetaValue value, GRUMetaGrad grad, +inline void gru_backward(const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, int frame_size) { auto &place = *context.eigen_device(); @@ -747,13 +901,19 @@ inline void gru_backward(const platform::CPUDeviceContext &context, if (value.prev_out_value) { auto value_prev_out = typename EigenVector::ConstType( value.prev_out_value, Array1(frame_size)); - SigmoidGradFunctor()(place, 1 /*useless*/, value_update_gate, - (value_prev_out - value_frame_state) * grad_output, - grad_update_gate); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_update_gate, + (value_prev_out - value_frame_state) 
* grad_output, + grad_update_gate); } else { - SigmoidGradFunctor()( - place, 1 /*useless*/, value_update_gate, - static_cast(-1) * value_frame_state * grad_output, grad_update_gate); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_update_gate, + static_cast(-1) * value_frame_state * grad_output, + grad_update_gate); } if (grad.prev_out_grad) { auto grad_prev_out = @@ -761,11 +921,16 @@ inline void gru_backward(const platform::CPUDeviceContext &context, grad_prev_out.device(place) = grad_prev_out + grad_output * value_update_gate; } - TanhGradFunctor()(place, 1 /*useless*/, value_frame_state, - grad_output * (static_cast(1.0) - value_update_gate), - grad_frame_state); - SigmoidGradFunctor()( - place, 1 /*useless*/, value_reset_gate, + paddle::operators::TanhGradFunctor()( + place, + 1 /*useless*/, + value_frame_state, + grad_output * (static_cast(1.0) - value_update_gate), + grad_frame_state); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_reset_gate, value_reset_output / value_reset_gate * grad_frame_state, grad_reset_gate); if (value.prev_out_value && grad.prev_out_grad) { @@ -774,10 +939,13 @@ inline void gru_backward(const platform::CPUDeviceContext &context, } template -inline void cpu_gru_backward(const platform::CPUDeviceContext &context, - OpGruGrad op_gru_grad, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, - int batch_size, ActivationType active_node, +inline void cpu_gru_backward(const paddle::platform::CPUDeviceContext &context, + OpGruGrad op_gru_grad, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_node, ActivationType active_gate) { for (int b = 0; b < batch_size; ++b) { // eigen @@ -801,6 +969,5 @@ inline void cpu_gru_backward(const platform::CPUDeviceContext &context, #endif // @} End Group for GRU CPU } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h similarity index 74% rename from paddle/fluid/operators/math/detail/gru_gpu_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index 75d4809a462..6657417beac 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -14,14 +14,13 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { /* @@ -30,9 +29,11 @@ namespace detail { */ template __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, - T *gate_value, T *reset_output_value, + T *gate_value, + T *reset_output_value, const T *prev_output_value, - int frame_size, int batch_size, + int frame_size, + int batch_size, ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -55,8 +56,11 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, r_prev_out = prev_output_value[frame_idx]; } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate); + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate); gate_value[frame_idx + frame_size * 0] = r_value_update_gate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; @@ -68,10 +72,14 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruForwardFinalOutput( - OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, int batch_size, ActivationType active_node, - bool origin_mode) { +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; @@ -92,8 +100,12 @@ __global__ void KeGruForwardFinalOutput( r_prev_out = prev_output_value[frame_idx]; } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); gate_value[frame_idx + frame_size * 2] = r_value_frame_state; output_value[frame_idx] = r_output; @@ -106,7 +118,8 @@ __global__ void KeGruForwardFinalOutput( template __global__ void KeFastCollectiveGruGate(T *gate_value, const T *prev_output_value, - const T *gate_weight, T *reset_output, + const T *gate_weight, + T *reset_output, int frame_size, ActivationType active_node) { T xt_0 = 0.0f; @@ -164,9 +177,12 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, */ template __global__ void KeFastCollectiveGruOut(const T *gate_weight, - const T *prev_out_value, T *output_value, - T *gate_value, T *reset_value, - int frame_size, ActivationType act_node, + const T *prev_out_value, + T *output_value, + T *gate_value, + T *reset_value, + int frame_size, + ActivationType act_node, bool origin_mode) { int COL = blockIdx.x * blockDim.x + threadIdx.x; @@ -221,10 +237,14 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, - T *gate_grad, const 
T *prev_out_value, - T *prev_out_grad, T *output_grad, - int frame_size, int batch_size, +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + int batch_size, ActivationType active_node, bool origin_mode) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -254,9 +274,15 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[frame_idx]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value, - &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad, - &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; @@ -270,10 +296,14 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, - int frame_size, int batch_size, +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, + int frame_size, + int batch_size, ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -302,9 +332,14 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, r_reset_output_grad = reset_output_grad[frame_idx]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value, - &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad, - &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; @@ -313,6 +348,5 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, } } } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h similarity index 64% rename from paddle/fluid/operators/math/detail/gru_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_kernel.h index 082c2a180da..db53fc4576d 100644 --- a/paddle/fluid/operators/math/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -14,13 +14,12 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" // TODO(guosheng): refine code style in gru_kernel -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { namespace forward { @@ -28,8 +27,10 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate, - T *prev_out, T *value_reset_output, + HOSTDEVICE void operator()(T *value_update_gate, + T *value_reset_gate, + T *prev_out, + T *value_reset_output, ActivationType act_gate, T *value_reset_bias = nullptr, bool old_version = true) { @@ -48,7 +49,8 @@ class gru_resetOutput { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_reset_gate, __m256 *prev_out, + __m256 *value_reset_gate, + __m256 *prev_out, __m256 *value_reset_output, ActivationType act_gate, __m256 *value_reset_bias = nullptr, @@ -71,9 +73,12 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state, - T *prev_out, T *value_output, - ActivationType act_input, bool origin_mode) { + HOSTDEVICE void operator()(T *value_update_gate, + T *value_frame_state, + T *prev_out, + T *value_output, + ActivationType act_input, + bool origin_mode) { *value_frame_state = activation(*value_frame_state, act_input); if (origin_mode) { *value_output = ((*value_update_gate) * (*prev_out)) + @@ -90,8 +95,10 @@ class gru_finalOutput { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_frame_state, __m256 *prev_out, - __m256 *value_output, ActivationType act_input, + __m256 *value_frame_state, + __m256 *prev_out, + __m256 *value_output, + ActivationType act_input, bool origin_mode) { *value_frame_state = activation(*value_frame_state, act_input); if (origin_mode) { @@ -116,10 +123,14 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, - T *value_frame_state, T *grad_frame_state, - T *value_prev_out, T *grad_prev_out, - T *grad_output, ActivationType act_input, + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_frame_state, + T *grad_frame_state, + T *value_prev_out, + T *grad_prev_out, + T *grad_output, + ActivationType act_input, bool origin_mode) { if (origin_mode) { *grad_update_gate = @@ -127,14 +138,15 @@ class gru_stateGrad { *grad_prev_out += (*grad_output * (*value_update_gate)); *grad_frame_state = activation( *grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, act_input); + *value_frame_state, + act_input); } else { *grad_update_gate = (*grad_output) * ((*value_frame_state) - (*value_prev_out)); *grad_prev_out += (*grad_output * (static_cast(1.0) - *value_update_gate)); - *grad_frame_state = activation(*grad_output * (*value_update_gate), - *value_frame_state, act_input); + *grad_frame_state = activation( + *grad_output * (*value_update_gate), *value_frame_state, act_input); } } #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU state grad @@ -145,28 +157,35 @@ class gru_stateGrad { HOSTDEVICE void operator()(__m256 *value_update_gate, __m256 *grad_update_gate, __m256 *value_frame_state, - __m256 *grad_frame_state, __m256 *value_prev_out, - __m256 *grad_prev_out, 
__m256 *grad_output, - ActivationType act_input, bool origin_mode) { + __m256 *grad_frame_state, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_output, + ActivationType act_input, + bool origin_mode) { if (origin_mode) { *grad_update_gate = _mm256_mul_ps( *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)); *grad_prev_out = _mm256_add_ps( *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); *grad_frame_state = activation( - _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), - *value_update_gate)), - *value_frame_state, act_input); + _mm256_mul_ps( + *grad_output, + _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), + *value_frame_state, + act_input); } else { *grad_update_gate = _mm256_mul_ps( *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out)); *grad_prev_out = _mm256_add_ps( *grad_prev_out, - _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), - *value_update_gate))); + _mm256_mul_ps( + *grad_output, + _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate))); *grad_frame_state = activation(_mm256_mul_ps(*grad_output, *value_update_gate), - *value_frame_state, act_input); + *value_frame_state, + act_input); } } #endif @@ -176,10 +195,14 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, - T *value_reset_gate, T *grad_reset_gate, - T *value_prev_out, T *grad_prev_out, - T *grad_reset_output, ActivationType act_gate) { + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_reset_gate, + T *grad_reset_gate, + T *value_prev_out, + T *grad_prev_out, + T *grad_reset_output, + ActivationType act_gate) { *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); *grad_update_gate = @@ -193,9 +216,12 @@ class gru_resetGrad { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, __m256 *value_reset_gate, - __m256 *grad_reset_gate, __m256 *value_prev_out, - __m256 *grad_prev_out, __m256 *grad_reset_output, + __m256 *grad_update_gate, + __m256 *value_reset_gate, + __m256 *grad_reset_gate, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_reset_output, ActivationType act_gate) { *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out); *grad_prev_out = _mm256_add_ps( @@ -211,23 +237,31 @@ class gru_resetGrad { template class gru { public: - HOSTDEVICE void operator()(T *value_reset_gate, T *grad_reset_gate, - T *value_update_gate, T *grad_update_gate, - T *value_frame_state, T *grad_frame_state, - T *value_prev_out, T *grad_prev_out, - T *grad_output, T *value_reset_output, - T *grad_reset_output, ActivationType act_node, + HOSTDEVICE void operator()(T *value_reset_gate, + T *grad_reset_gate, + T *value_update_gate, + T *grad_update_gate, + T *value_frame_state, + T *grad_frame_state, + T *value_prev_out, + T *grad_prev_out, + T *grad_output, + T *value_reset_output, + T *grad_reset_output, + ActivationType act_node, ActivationType act_gate) { *grad_update_gate = activation((*grad_output) * ((*value_prev_out) - (*value_frame_state)), - (*value_update_gate), act_gate); + (*value_update_gate), + act_gate); *grad_prev_out += (*grad_output * (*value_update_gate)); *grad_frame_state = activation(*grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, act_node); + *value_frame_state, + act_node); T reset_output = 
(*value_reset_output) / (*value_reset_gate); - *grad_reset_gate = activation(reset_output * (*grad_frame_state), - *value_reset_gate, act_gate); + *grad_reset_gate = activation( + reset_output * (*grad_frame_state), *value_reset_gate, act_gate); *grad_reset_output = (*value_reset_gate) * (*grad_frame_state); } #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU CPU @@ -235,29 +269,36 @@ class gru { static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_reset_gate, __m256 *grad_reset_gate, + HOSTDEVICE void operator()(__m256 *value_reset_gate, + __m256 *grad_reset_gate, __m256 *value_update_gate, __m256 *grad_update_gate, __m256 *value_frame_state, - __m256 *grad_frame_state, __m256 *value_prev_out, - __m256 *grad_prev_out, __m256 *grad_output, + __m256 *grad_frame_state, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_output, __m256 *value_reset_output, - __m256 *grad_reset_output, ActivationType act_node, + __m256 *grad_reset_output, + ActivationType act_node, ActivationType act_gate) { *grad_update_gate = activation( _mm256_mul_ps(*grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)), - *value_update_gate, act_gate); + *value_update_gate, + act_gate); *grad_prev_out = _mm256_add_ps( *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); *grad_frame_state = activation( _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), - *value_frame_state, act_node); + *value_frame_state, + act_node); __m256 reset_output = _mm256_div_ps(*value_reset_output, *value_reset_gate); *grad_reset_gate = activation(_mm256_mul_ps(reset_output, *grad_frame_state), - *value_reset_gate, act_gate); + *value_reset_gate, + act_gate); *grad_reset_output = _mm256_mul_ps(*value_reset_gate, *grad_frame_state); } #endif @@ -267,6 +308,5 @@ class gru { } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h similarity index 65% rename from paddle/fluid/operators/math/detail/lstm_cpu_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index 169c5488bb5..10dbf27d348 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #if defined(_WIN32) #if defined(__AVX2__) || defined(__AVX__) @@ -25,21 +25,23 @@ inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } #endif #endif -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { using Array1 = Eigen::DSizes; -template -using EigenVector = framework::EigenVector; +using EigenVector = paddle::framework::EigenVector; #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM CPU template -void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, T cell_clip, +void naive_lstm_forward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state, @@ -79,9 +81,21 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -94,9 +108,12 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, } template -void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - T cell_clip, ActivationType active_node, +void naive_lstm_backward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state, bool old_api_version) { @@ -157,11 +174,30 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, - &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, - &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, - &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -179,8 +215,10 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, } template -void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, T cell_clip, +void avx_lstm_forward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state, @@ -226,9 +264,21 @@ void 
avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, (reinterpret_cast<__m256 const *>(value.prev_state_value))[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -242,9 +292,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, } template -void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - T cell_clip, ActivationType active_node, +void avx_lstm_backward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state, bool old_api_version) { @@ -311,11 +364,30 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, (reinterpret_cast<__m256 const *>(value.prev_state_value))[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, - &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, - &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, - &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -338,8 +410,10 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, } template -void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, - LstmMetaValue value, int frame_size) { +void eigen_lstm_forward_one_sequence( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::LstmMetaValue value, + int frame_size) { auto eigen_value_ig = typename EigenVector::Type(value.gate_value, Array1(frame_size)); auto eigen_value_fg = typename EigenVector::Type( @@ -356,10 +430,10 @@ void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, typename EigenVector::Type(value.output_value, Array1(frame_size)); auto &place = *context.eigen_device(); - TanhFunctor()(place, eigen_value_in, eigen_value_in); - SigmoidFunctor()(place, eigen_value_ig, eigen_value_ig); - SigmoidFunctor()(place, eigen_value_fg, eigen_value_fg); - SigmoidFunctor()(place, eigen_value_og, eigen_value_og); + paddle::operators::TanhFunctor()(place, eigen_value_in, eigen_value_in); + paddle::operators::SigmoidFunctor()(place, eigen_value_ig, eigen_value_ig); + paddle::operators::SigmoidFunctor()(place, eigen_value_fg, eigen_value_fg); + paddle::operators::SigmoidFunctor()(place, eigen_value_og, eigen_value_og); eigen_state.device(place) = eigen_value_in * eigen_value_ig; if (value.prev_state_value) { @@ -368,14 +442,16 @@ void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, eigen_state.device(place) = eigen_state + eigen_prev_state * eigen_value_fg; } - TanhFunctor()(place, eigen_state, 
eigen_state_act); + paddle::operators::TanhFunctor()(place, eigen_state, eigen_state_act); eigen_output.device(place) = eigen_value_og * eigen_state_act; } template -void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, - LstmMetaValue value, - LstmMetaGrad grad, int frame_size) { +void eigen_lstm_backward_one_sequence( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size) { auto eigen_value_ig = typename EigenVector::Type(value.gate_value, Array1(frame_size)); auto eigen_value_fg = typename EigenVector::Type( @@ -401,23 +477,38 @@ void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, typename EigenVector::Type(grad.state_grad, Array1(frame_size)); auto &place = *context.eigen_device(); - SigmoidGradFunctor()(place, 1 /*useless*/, eigen_value_og, - eigen_grad_output * eigen_state_act, eigen_grad_og); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + eigen_value_og, + eigen_grad_output * eigen_state_act, + eigen_grad_og); eigen_grad_state.device(place) = eigen_grad_state + eigen_grad_output * eigen_value_og * (static_cast(1) - eigen_state_act * eigen_state_act); - TanhGradFunctor()(place, 1, eigen_value_in, - eigen_grad_state * eigen_value_ig, eigen_grad_in); - SigmoidGradFunctor()(place, 1, eigen_value_ig, - eigen_grad_state * eigen_value_in, eigen_grad_ig); + paddle::operators::TanhGradFunctor()(place, + 1, + eigen_value_in, + eigen_grad_state * eigen_value_ig, + eigen_grad_in); + paddle::operators::SigmoidGradFunctor()(place, + 1, + eigen_value_ig, + eigen_grad_state * eigen_value_in, + eigen_grad_ig); if (value.prev_state_value) { auto eigen_prev_state = typename EigenVector::ConstType( value.prev_state_value, Array1(frame_size)); - SigmoidGradFunctor()(place, 1, eigen_value_fg, - eigen_grad_state * eigen_prev_state, eigen_grad_fg); + paddle::operators::SigmoidGradFunctor()( + place, + 1, + eigen_value_fg, + eigen_grad_state * eigen_prev_state, + eigen_grad_fg); } else { - SigmoidGradFunctor()(place, 1, eigen_value_fg, 0, eigen_grad_fg); + paddle::operators::SigmoidGradFunctor()( + place, 1, eigen_value_fg, 0, eigen_grad_fg); } if (grad.prev_state_grad) { auto eigen_grad_pre_state = @@ -427,42 +518,74 @@ void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, } template -void cpu_lstm_forward(const platform::CPUDeviceContext &context, Op op, - LstmMetaValue value, int frame_size, T cell_clip, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state, bool old_api_version) { +void cpu_lstm_forward(const paddle::platform::CPUDeviceContext &context, + Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state, + bool old_api_version) { if (!old_api_version) { eigen_lstm_forward_one_sequence(context, value, frame_size); } else { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frame_size, cell_clip, - active_node, active_gate, active_state, + avx_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, old_api_version); } else { - naive_lstm_forward_one_sequence(op, value, frame_size, cell_clip, - active_node, active_gate, active_state, + naive_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, 
old_api_version); } } } template -void cpu_lstm_backward(const platform::CPUDeviceContext &context, Op op, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, T cell_clip, ActivationType active_node, - ActivationType active_gate, ActivationType active_state, +void cpu_lstm_backward(const paddle::platform::CPUDeviceContext &context, + Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state, bool old_api_version) { if (!old_api_version) { eigen_lstm_backward_one_sequence(context, value, grad, frame_size); } else { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, - active_node, active_gate, active_state, + avx_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, old_api_version); } else { - naive_lstm_backward_one_sequence(op, value, grad, frame_size, - cell_clip, active_node, active_gate, - active_state, old_api_version); + naive_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, + old_api_version); } } } @@ -470,6 +593,5 @@ void cpu_lstm_backward(const platform::CPUDeviceContext &context, Op op, #endif // @{ End Group LSTM CPU } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h similarity index 68% rename from paddle/fluid/operators/math/detail/lstm_gpu_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 851a62dbe9a..6d4c430d9e6 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -15,14 +15,13 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { /* @@ -30,8 +29,11 @@ namespace detail { * grid(frame_blocks, batch_blocks) */ template -__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, - int batch_size, T cell_clip, +__global__ void KeLstmForward(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -71,9 +73,21 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, r_prev_state = value.prev_state_value[frame_idx]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx + frame_size] = r_value_ig; @@ -90,9 +104,12 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeLstmBackward(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - int batch_size, T cell_clip, +__global__ void KeLstmBackward(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -147,11 +164,30 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[frame_idx]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig, - &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, - &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, - &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip, - active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx + frame_size] = r_grad_ig; @@ -185,10 +221,15 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, } template -void gpu_lstm_forward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, ActivationType active_node, - ActivationType active_gate, ActivationType active_state) { +void gpu_lstm_forward(const paddle::platform::DeviceContext& context, + Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + 
ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { @@ -203,25 +244,45 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, } auto stream = - reinterpret_cast(context).stream(); + reinterpret_cast(context) + .stream(); if (batch_size == 1) { - KeLstmForward<<>>( - op, value, frame_size, batch_size, cell_clip, active_node, active_gate, + op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, active_state); } else { - KeLstmForward<<>>( - op, value, frame_size, batch_size, cell_clip, active_node, active_gate, + op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, active_state); } } template -void gpu_lstm_backward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - ActivationType active_node, ActivationType active_gate, +void gpu_lstm_backward(const paddle::platform::DeviceContext& context, + Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { dim3 threads; dim3 grid; @@ -237,21 +298,37 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, } auto stream = - reinterpret_cast(context).stream(); + reinterpret_cast(context) + .stream(); if (batch_size == 1) { - KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, cell_clip, active_node, - active_gate, active_state); + op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } else { - KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, cell_clip, active_node, - active_gate, active_state); + op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } } } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h similarity index 59% rename from paddle/fluid/operators/math/detail/lstm_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_kernel.h index 2d4e7dd59fb..8b429264125 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -14,12 +14,11 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { namespace forward { @@ -27,9 +26,18 @@ namespace forward { template class lstm { public: - HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, - T *prev_state, T *state, T *state_atv, T *output, - T *checkI, T *checkF, T *checkO, T *cell_clip, + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *prev_state, + T *state, + T *state_atv, + T *output, + T *checkI, + T *checkF, + T *checkO, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -57,11 +65,18 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig, - __m256 *value_fg, __m256 *value_og, - __m256 *prev_state, __m256 *state, - __m256 *state_atv, __m256 *output, __m256 *checkI, - __m256 *checkF, __m256 *checkO, T *cell_clip, + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *prev_state, + __m256 *state, + __m256 *state_atv, + __m256 *output, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -97,12 +112,27 @@ namespace backward { template class lstm { public: - HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, - T *grad_in, T *grad_ig, T *grad_fg, T *grad_og, - T *prev_state, T *prev_state_grad, T *state, - T *state_grad, T *state_atv, T *output_grad, - T *checkI, T *checkF, T *checkO, T *checkIGrad, - T *checkFGrad, T *checkOGrad, T *cell_clip, + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *grad_in, + T *grad_ig, + T *grad_fg, + T *grad_og, + T *prev_state, + T *prev_state_grad, + T *state, + T *state_grad, + T *state_atv, + T *output_grad, + T *checkI, + T *checkF, + T *checkO, + T *checkIGrad, + T *checkFGrad, + T *checkOGrad, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -138,17 +168,32 @@ class lstm { #else // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()( - __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og, - __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og, - __m256 *prev_state, __m256 *prev_state_grad, __m256 *state, - __m256 *state_grad, __m256 *state_atv, __m256 *output_grad, - __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad, - __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { - *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og, - active_gate); + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *grad_in, + __m256 *grad_ig, + __m256 *grad_fg, + __m256 *grad_og, + __m256 *prev_state, + __m256 *prev_state_grad, + __m256 *state, + __m256 *state_grad, + __m256 *state_atv, + __m256 *output_grad, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + __m256 *checkIGrad, + __m256 *checkFGrad, + 
__m256 *checkOGrad, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *grad_og = activation( + _mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate); if (*cell_clip > 0.0f) { T *state_ = reinterpret_cast(state); if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) { @@ -156,18 +201,19 @@ class lstm { } else { *state_grad = _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), - *state_atv, active_state), + *state_atv, + active_state), *state_grad); *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); } } - *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in, - active_node); - *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig, - active_gate); - *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg, - active_gate); + *grad_in = activation( + _mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node); + *grad_ig = activation( + _mm256_mul_ps(*state_grad, *value_in), *value_ig, active_gate); + *grad_fg = activation( + _mm256_mul_ps(*state_grad, *prev_state), *value_fg, active_gate); *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI), _mm256_mul_ps(*grad_fg, *checkF)); *prev_state_grad = @@ -183,6 +229,5 @@ class lstm { } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc new file mode 100644 index 00000000000..4f159fd28af --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/gru_compute.h" + +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +namespace phi { +namespace funcs { + +template +struct GRUUnitFunctor { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size * 2, + frame_size, + 1, + value.prev_out_value, + frame_size, + value.gate_weight, + frame_size * 2, + 1, + value.gate_value, + frame_size * 3); + } + + detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), + value, + frame_size, + batch_size, + active_gate, + true, + nullptr); + + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + value.reset_output_value, + frame_size, + value.state_weight, + frame_size, + 1, + value.gate_value + frame_size * 2, + frame_size * 3); + } + + detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), + value, + frame_size, + batch_size, + active_node, + origin_mode, + true, + nullptr); +#endif + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + detail::backward_state_grad( + phi::funcs::detail::backward::gru_stateGrad(), + value, + grad, + frame_size, + batch_size, + active_node, + origin_mode); + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value && grad.prev_out_grad) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size * 2, + frame_size * 3, + value.state_weight, + frame_size, + 0, + grad.reset_output_grad, + frame_size); + + if (grad.state_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + value.reset_output_value, + frame_size, + grad.gate_grad + frame_size * 2, + frame_size * 3, + 1, + grad.state_weight_grad, + frame_size); + } + } + + detail::backward_reset_grad( + phi::funcs::detail::backward::gru_resetGrad(), + value, + grad, + frame_size, + batch_size, + active_gate); + if (grad.prev_out_grad && value.prev_out_value) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size * 2, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size * 2, + 1, + grad.prev_out_grad, + frame_size); + + if (grad.gate_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size * 2, + batch_size, + 1, + value.prev_out_value, + frame_size, + grad.gate_grad, + frame_size * 3, + 1, + grad.gate_weight_grad, + frame_size * 2); + } + } +#endif + } +}; + +template +struct GRUUnitFunctorV2 { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate) { +#if !defined(__NVCC__) && 
!defined(__HIPCC___) + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(CblasNoTrans, + CblasTrans, + batch_size, + frame_size, + frame_size, + 1, + value.prev_out_value, + value.state_weight, + 0, + value.reset_output_value); + } + detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), + value, + frame_size, + batch_size, + active_gate, + false, + &context); + + T *cell_state_value = value.gate_value + 2 * frame_size; + T *reset_output_value = value.reset_output_value; + for (int b = 0; b < batch_size; ++b) { + blas.VADD( + frame_size, cell_state_value, reset_output_value, cell_state_value); + cell_state_value += frame_size * 3; + reset_output_value += frame_size; + } + + detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), + value, + frame_size, + batch_size, + active_node, + true, + false, + &context); +#endif + } +}; + +template +struct GRUUnitGradFunctorV2 { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + // calculate grad_update_gate, grad_frame_state, + // grad_reset_output, grad_reset_gate + detail::cpu_gru_backward(context, + phi::funcs::detail::backward::gru(), + value, + grad, + frame_size, + batch_size, + active_node, + active_gate); + auto blas = + phi::funcs::GetBlas(context); + if (grad.prev_out_grad && value.prev_out_value) { + // update prev_out_grad + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size, + frame_size * 3, + value.gate_weight + frame_size * frame_size, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.reset_output_grad, + frame_size, + value.state_weight, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + // update weight_hh_grad + if (grad.gate_weight_grad) { + // reset gate + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.gate_grad, + frame_size * 3, + value.prev_out_value, + frame_size, + 1, + grad.gate_weight_grad, + frame_size); + // update gate + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.gate_grad + frame_size, + frame_size * 3, + value.prev_out_value, + frame_size, + 1, + grad.gate_weight_grad + frame_size * frame_size, + frame_size); + // cell state + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.reset_output_grad, + frame_size, + value.prev_out_value, + frame_size, + 1, + grad.state_weight_grad, + frame_size); + } + } + // update bias_hh_grad + T *gate_grad = grad.gate_grad; + T *bias_hh_grad = grad.bias_hh_grad; + T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; + T *reset_output_grad = grad.reset_output_grad; + for (int b = 0; b < batch_size; ++b) { + blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); + blas.VADD( + frame_size, state_bias_grad, reset_output_grad, state_bias_grad); + gate_grad += 3 * frame_size; + reset_output_grad += frame_size; + } +#endif + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template 
struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +template struct GRUUnitFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitGradFunctorV2; +template struct GRUUnitGradFunctorV2; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu new file mode 100644 index 00000000000..7666206b7f7 --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" + +namespace phi { +namespace funcs { + +template +struct GRUUnitFunctor { + static void compute(const paddle::platform::CUDADeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + if (context.GetComputeCapability() >= 70) { + if (frame_size < 16) { + constexpr int tiled_size = 8; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruGate< + T, + tiled_size><<>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruOut< + T, + tiled_size><<>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } else { + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruGate< + T, + tiled_size><<>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruOut< + T, + tiled_size><<>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } + return; + } else { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size * 2, + frame_size, + 1, + value.prev_out_value, + frame_size, + value.gate_weight, + frame_size * 2, + 1, + value.gate_value, + frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardResetOutput< + phi::funcs::detail::forward::gru_resetOutput, + /* is_batch= */ false, + T><<>>( + phi::funcs::detail::forward::gru_resetOutput(), + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate); + } else { + detail::KeGruForwardResetOutput< + phi::funcs::detail::forward::gru_resetOutput, + /* is_batch= */ true, + T><<>>( + phi::funcs::detail::forward::gru_resetOutput(), + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate); + } + + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + value.reset_output_value, + frame_size, + value.state_weight, + frame_size, + 1, + value.gate_value + frame_size * 2, + frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardFinalOutput< + phi::funcs::detail::forward::gru_finalOutput, + /* is_batch= */ false, + T><<>>( + phi::funcs::detail::forward::gru_finalOutput(), + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode); + } else { + detail::KeGruForwardFinalOutput< + phi::funcs::detail::forward::gru_finalOutput, + /* is_batch= */ true, + T><<>>( + phi::funcs::detail::forward::gru_finalOutput(), + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode); + } + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const paddle::platform::CUDADeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (batch_size == 1) { + detail::KeGruBackwardStateGrad< + phi::funcs::detail::backward::gru_stateGrad, + /* is_batch= */ false><<>>( + phi::funcs::detail::backward::gru_stateGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + batch_size, + active_node, + origin_mode); + } else { + detail::KeGruBackwardStateGrad< + phi::funcs::detail::backward::gru_stateGrad, + /* is_batch= */ true><<>>( + phi::funcs::detail::backward::gru_stateGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + batch_size, + active_node, + origin_mode); + } + + auto blas = + phi::funcs::GetBlas(context); + + if (value.prev_out_value && grad.prev_out_grad) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size * 2, + frame_size * 3, + value.state_weight, + frame_size, + 0, + grad.reset_output_grad, + frame_size); + + if (grad.state_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + value.reset_output_value, + frame_size, + grad.gate_grad + frame_size * 2, + frame_size * 3, + 1, + grad.state_weight_grad, + frame_size); + } + } + + if (batch_size == 1) { + detail::KeGruBackwardResetGrad< + phi::funcs::detail::backward::gru_resetGrad, + /* is_batch= */ false><<>>( + phi::funcs::detail::backward::gru_resetGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + batch_size, + active_gate); + } else { + detail::KeGruBackwardResetGrad< + phi::funcs::detail::backward::gru_resetGrad, + /* is_batch= */ true><<>>( + phi::funcs::detail::backward::gru_resetGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + batch_size, + active_gate); + } + + if (grad.prev_out_grad && value.prev_out_value) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size * 2, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size * 2, + 1, + grad.prev_out_grad, + frame_size); + + if (grad.gate_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size * 2, + batch_size, + 1, + value.prev_out_value, + frame_size, + grad.gate_grad, + frame_size * 3, + 1, + grad.gate_weight_grad, + frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.h b/paddle/phi/kernels/funcs/gru_compute.h new file mode 100644 index 00000000000..02b2b91423c --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" + +namespace phi { +namespace funcs { + +template +struct GRUMetaValue { + const T *gate_weight; + const T *state_weight; + const T *reset_bias; + T *gate_value; + T *reset_output_value; + T *output_value; + const T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; + T *bias_hh_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const DeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const DeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode); +}; + +template +struct GRUUnitFunctorV2 { + static void compute(const DeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate); +}; + +template +struct GRUUnitGradFunctorV2 { + static void compute(const DeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate); +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cc b/paddle/phi/kernels/funcs/lstm_compute.cc new file mode 100644 index 00000000000..19932c62b01 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_compute.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" + +namespace phi { +namespace funcs { + +template +struct LstmUnitFunctor { + static void compute(const paddle::platform::CPUDeviceContext& context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(context, + phi::funcs::detail::forward::lstm(), + value, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act, + old_api_version); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const paddle::platform::CPUDeviceContext& context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(context, + phi::funcs::detail::backward::lstm(), + value, + grad, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act, + old_api_version); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cu b/paddle/phi/kernels/funcs/lstm_compute.cu new file mode 100644 index 00000000000..b2057cfc4f9 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_compute.cu @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" + +namespace phi { +namespace funcs { + +template +struct LstmUnitFunctor { + static void compute(const paddle::platform::CUDADeviceContext& context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + detail::gpu_lstm_forward(context, + phi::funcs::detail::forward::lstm(), + value, + frame_size, + batch_size, + cell_clip, + cand_act, + gate_act, + cell_act); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const paddle::platform::CUDADeviceContext& context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + detail::gpu_lstm_backward(context, + phi::funcs::detail::backward::lstm(), + value, + grad, + frame_size, + batch_size, + cell_clip, + cand_act, + gate_act, + cell_act); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/phi/kernels/funcs/lstm_compute.h similarity index 56% rename from paddle/fluid/operators/math/lstm_compute.h rename to paddle/phi/kernels/funcs/lstm_compute.h index cc91f784f39..d51b92fc4fd 100644 --- a/paddle/fluid/operators/math/lstm_compute.h +++ b/paddle/phi/kernels/funcs/lstm_compute.h @@ -14,13 +14,12 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template struct LstmMetaValue { @@ -49,25 +48,31 @@ struct LstmMetaGrad { template class LstmUnitFunctor { public: - static void compute(const DeviceContext &context, LstmMetaValue value, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType &gate_act, - const detail::ActivationType &cell_act, - const detail::ActivationType &cand_act, + static void compute(const DeviceContext &context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType &gate_act, + const phi::funcs::detail::ActivationType &cell_act, + const phi::funcs::detail::ActivationType &cand_act, bool old_api_version = true); }; template class LstmUnitGradFunctor { public: - static void compute(const DeviceContext &context, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType &gate_act, - const detail::ActivationType &cell_act, - const detail::ActivationType &cand_act, + static void compute(const DeviceContext &context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType &gate_act, + const phi::funcs::detail::ActivationType &cell_act, + const phi::funcs::detail::ActivationType &cand_act, bool old_api_version = true); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc similarity index 56% rename from paddle/fluid/operators/math/sequence2batch.cc rename to paddle/phi/kernels/funcs/sequence2batch.cc index 852700fa7ff..0d75ba877db 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -12,47 +12,45 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index) { size_t* index = index_lod.data(); auto src_dims = src.dims(); auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(src_dims.size(), + 2UL, + phi::errors::InvalidArgument( "The source tensor must be a matrix with rank 2, but " "got the source tensor rank is %lu. 
" "Please check the rank of the source tensor", src_dims.size())); - PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(dst_dims.size(), + 2UL, + phi::errors::InvalidArgument( "The destination tensor must be a matrix with rank, " "but got the destination tensor rank is %lu. " "Please check the rank of the destination tensor", dst_dims.size())); PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], - platform::errors::InvalidArgument( + src_dims[1], + dst_dims[1], + phi::errors::InvalidArgument( "The width of the source tensor and the destination tensor must be " "same. But got %lu != %lu.Please check the rank of the source " "tensor", - src_dims.size(), dst_dims.size())); + src_dims.size(), + dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); @@ -70,14 +68,18 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu similarity index 55% rename from paddle/fluid/operators/math/sequence2batch.cu rename to paddle/phi/kernels/funcs/sequence2batch.cu index f56c5293971..a66030e6426 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/phi/kernels/funcs/sequence2batch.cu @@ -11,15 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template -__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, - int64_t height, int64_t width, +__global__ void CopyMatrixRowsKernel(const T* src, + T* dst, + const size_t* index, + int64_t height, + int64_t width, bool is_src_index) { int idx = threadIdx.x; int idy = threadIdx.y; @@ -37,33 +39,38 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index) { auto src_dims = src.dims(); auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ(src_dims.size(), 2, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(src_dims.size(), + 2, + phi::errors::InvalidArgument( "The source tensor must be a matrix with rank 2, but " "got the source tensor rank is %lu. 
" "Please check the rank of the source tensor", src_dims.size())); - PADDLE_ENFORCE_EQ(dst_dims.size(), 2, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(dst_dims.size(), + 2, + phi::errors::InvalidArgument( "The destination tensor must be a matrix with rank, " "but got the destination tensor rank is %lu. " "Please check the rank of the destination tensor", dst_dims.size())); PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], - platform::errors::InvalidArgument( + src_dims[1], + dst_dims[1], + phi::errors::InvalidArgument( "The width of the source tensor and the destination tensor must be " "same. But got %lu != %lu.Please check the rank of the source " "tensor", - src_dims.size(), dst_dims.size())); + src_dims.size(), + dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); @@ -74,19 +81,28 @@ class CopyMatrixRowsFunctor { auto stream = context.stream(); paddle::framework::MixVector mix_index_lod(&index_lod); CopyMatrixRowsKernel<<>>( - src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height, - width, is_src_index); + src_data, + dst_data, + mix_index_lod.CUDAData(context.GetPlace()), + height, + width, + is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/phi/kernels/funcs/sequence2batch.h similarity index 80% rename from paddle/fluid/operators/math/sequence2batch.h rename to paddle/phi/kernels/funcs/sequence2batch.h index 6aa513e4d10..e7c387fb99b 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/phi/kernels/funcs/sequence2batch.h @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { -template -using EigenMatrix = framework::EigenMatrix; +using EigenMatrix = paddle::framework::EigenMatrix; template class CopyMatrixRowsFunctor { @@ -36,8 +36,10 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. 
- void operator()(const DeviceContext& context, const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index); }; @@ -59,32 +61,37 @@ class LoDTensor2BatchFunctor { public: void operator()(const DeviceContext& context, - const framework::LoDTensor& lod_tensor, - framework::LoDTensor* batch, bool is_cal_batch_lod, + const paddle::framework::LoDTensor& lod_tensor, + paddle::framework::LoDTensor* batch, + bool is_cal_batch_lod, bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); PADDLE_ENFORCE_GT( - lods.size(), 2UL, - platform::errors::InvalidArgument( + lods.size(), + 2UL, + phi::errors::InvalidArgument( "The LoD of LoDTensor should inlcude at least 2-level " "sequence information, but got the LoD level is %lu. Please " "check the input value.", lods.size())); PADDLE_ENFORCE_EQ( - lods[1].size(), static_cast(lod_tensor.dims()[0]), - platform::errors::InvalidArgument( + lods[1].size(), + static_cast(lod_tensor.dims()[0]), + phi::errors::InvalidArgument( "The LoD information should be consistent with the dims, but got " "%lu != %lu. Please check the input value.", - lods[1].size(), static_cast(lod_tensor.dims()[0]))); + lods[1].size(), + static_cast(lod_tensor.dims()[0]))); CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(lods.size(), + 1UL, + phi::errors::InvalidArgument( "Only support one level sequence now, but got the " "LoD level is %lu. Please check the input value.", lods.size())); @@ -97,8 +104,9 @@ class LoDTensor2BatchFunctor { seq_info.emplace_back(lod[seq_id], length, seq_id); } - std::sort(seq_info.begin(), seq_info.end(), - [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { + return a.length > b.length; + }); // Calculate the start position of each batch. // example: sequences = {s0, s1, s2} @@ -169,27 +177,29 @@ template class Batch2LoDTensorFunctor { public: void operator()(const DeviceContext& context, - const framework::LoDTensor& batch, - framework::LoDTensor* lod_tensor) const { + const paddle::framework::LoDTensor& batch, + paddle::framework::LoDTensor* lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT( - in_lod.size(), 2UL, - platform::errors::InvalidArgument( + in_lod.size(), + 2UL, + phi::errors::InvalidArgument( "The LoD of LoDTensor should inlcude at least 2-level " "sequence information, but got the LoD level is %lu. Please check " "the input value.", in_lod.size())); PADDLE_ENFORCE_EQ( - in_lod[1].size(), static_cast(lod_tensor->dims()[0]), - platform::errors::InvalidArgument( + in_lod[1].size(), + static_cast(lod_tensor->dims()[0]), + phi::errors::InvalidArgument( "The LoD information should be consistent with the dims, but got " "%lu != %lu. 
Please check the input value.", - in_lod[1].size(), static_cast(lod_tensor->dims()[0]))); + in_lod[1].size(), + static_cast(lod_tensor->dims()[0]))); CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi -- GitLab From 1ff1c1e09a835123fdfe48cc7660f0d190c64e1e Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:44:15 +0800 Subject: [PATCH 036/272] add share external data interface (#39809) --- .../api/analysis_predictor_tester.cc | 82 +++++++++++++++++ .../inference/api/details/zero_copy_tensor.cc | 87 +++++++++++++++++++ paddle/fluid/inference/api/paddle_tensor.h | 13 +++ 3 files changed, 182 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a15a1cd84b1..9c7e5c6b27e 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" +#if defined(PADDLE_WITH_CUDA) +#include +#endif #include #include #include // NOLINT @@ -405,4 +408,83 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Tensor, CpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + w0->ShareExternalData(input_data[0].data(), {4, 1}, PlaceType::kCPU); + w1->ShareExternalData(input_data[1].data(), {4, 1}, PlaceType::kCPU); + w2->ShareExternalData(input_data[2].data(), {4, 1}, PlaceType::kCPU); + w3->ShareExternalData(input_data[3].data(), {4, 1}, PlaceType::kCPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + std::vector out_data; + out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies())); + out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} + +#if defined(PADDLE_WITH_CUDA) +TEST(Tensor, GpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + std::vector input_gpu(4, nullptr); + + for (size_t i = 0; i < 4; ++i) { + cudaMalloc(reinterpret_cast(&input_gpu[i]), 4 * sizeof(int64_t)); + cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), + cudaMemcpyHostToDevice); + } + + w0->ShareExternalData(input_gpu[0], {4, 1}, PlaceType::kGPU); + w1->ShareExternalData(input_gpu[1], {4, 1}, PlaceType::kGPU); + w2->ShareExternalData(input_gpu[2], {4, 1}, PlaceType::kGPU); + w3->ShareExternalData(input_gpu[3], {4, 1}, PlaceType::kGPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + 
float* out_data; + auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) * + sizeof(float); + cudaMalloc(reinterpret_cast(out_data), out_size * sizeof(float)); + out->ShareExternalData(out_data, out_shape, PlaceType::kGPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 1d09b01f8f8..18b1d09f0e8 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/allocator.h" namespace paddle_infer { @@ -205,6 +206,73 @@ void Tensor::CopyFromCpu(const T *data) { } } +template +struct DataTypeInfo; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT32; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT16; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT64; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::UINT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT32; +}; + +paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { + PADDLE_ENFORCE_EQ( + layout, DataLayout::kNCHW, + paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); + return paddle::experimental::DataLayout::NCHW; +} + +template +void Tensor::ShareExternalData(const T *data, const std::vector &shape, + PlaceType place, DataLayout layout) { + EAGER_GET_TENSOR(paddle::framework::LoDTensor) + size_t size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + phi::DenseTensorMeta meta(DataTypeInfo().TYPE, phi::make_ddim(shape), + LayoutConvert(layout)); + if (place == PlaceType::kCPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CPUPlace()), + meta); + *tensor = std::move(dtensor); + } else if (place == PlaceType::kGPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CUDAPlace(device_)), + meta); + *tensor = std::move(dtensor); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "PlaceType must be PlaceType::kCPU or PlaceType::kGPU.")); + } +} + void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { EAGER_GET_TENSOR(paddle_infer::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, @@ -334,6 +402,25 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const uint8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int64_t *data, const std::vector 
&shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int32_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const uint8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float16 *data, const std::vector &shape, PlaceType place, + DataLayout layout); + template PD_INFER_DECL void Tensor::CopyToCpu(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int64_t *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int32_t *data) const; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 81eecbb2c14..5a98d109aed 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -47,6 +47,8 @@ enum DataType { enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; + /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. /// Zero copy means that the tensor supports direct copy of host or device data @@ -92,6 +94,17 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Share the data with tensor data. + /// It's usually used to set the tensor data. + /// \param data The pointer of the data, from which the tensor will share. + /// \param shape The shape of data. + /// \param place The place of data. + /// \param layout The layout of data. Only NCHW is supported now. + template + void ShareExternalData(const T* data, const std::vector& shape, + PlaceType place, + DataLayout layout = DataLayout::kNCHW); + /// \brief Experimental interface. /// It's usually used to set the input tensor data with Strings data type. /// \param data The pointer of the data, from which the tensor will copy. 
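For quick reference, a minimal CPU-side usage sketch of the interface documented above; the wrapper function, the input name "x", the 4x1 shape and the public header name are illustrative assumptions rather than part of this patch, and the call mirrors the CpuShareExternalData test added earlier in this commit.

#include <vector>
#include "paddle_inference_api.h"  // assumed public inference header

void RunWithSharedInput(paddle_infer::Predictor* predictor) {
  // Caller-owned buffer; it must stay alive until Run() finishes, because the
  // tensor wraps the external pointer instead of copying the data.
  std::vector<float> input(4, 0.f);
  auto in = predictor->GetInputHandle("x");  // "x" is a placeholder input name
  in->ShareExternalData(input.data(), {4, 1}, paddle_infer::PlaceType::kCPU);
  // The layout argument defaults to DataLayout::kNCHW, per the declaration above.
  predictor->Run();
}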
-- GitLab From 26e2b918d80bb60855b9d1f8c0251d81e7c9e569 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 2 Mar 2022 11:14:04 +0800 Subject: [PATCH 037/272] ernie: revert skip_layernorm_fp16 (#39991) --- paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 71c4348685e..753cd707276 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -105,7 +105,7 @@ class SkipLayerNormOpConverter : public OpConverter { "in CustomSkipLayerNormPluginDynamic hidden " "dimension should > 0")); if (enable_int8) { - type = static_cast(nvinfer1::DataType::kINT8); + type = static_cast(nvinfer1::DataType::kHALF); } const std::vector fields{ -- GitLab From 9af72957520e4dffa6356bc637e0532bd799ab75 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 2 Mar 2022 11:14:25 +0800 Subject: [PATCH 038/272] [Eager] open eager when WITH_PYTHON (#39979) * open eager when WITH_PYTHON, test=develop * refine, test=develop * refine, test=develop * add DWITH_PYTHON for gen_fluid_lib, test=develop --- paddle/fluid/eager/CMakeLists.txt | 2 +- paddle/fluid/eager/api/generated/CMakeLists.txt | 2 +- .../eager_generated/backwards/CMakeLists.txt | 2 +- .../eager_generated/forwards/CMakeLists.txt | 2 +- paddle/fluid/eager/tests/CMakeLists.txt | 2 +- paddle/fluid/eager/tests/task_tests/CMakeLists.txt | 2 +- paddle/fluid/pybind/CMakeLists.txt | 8 ++++---- paddle/fluid/pybind/pybind.cc | 8 ++------ paddle/scripts/paddle_build.sh | 14 ++++++++++---- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 10 files changed, 23 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 5e16ab2b391..8cb69caf663 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -2,7 +2,7 @@ set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward p set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index ebbef286f79..4f634c6884b 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 77d8ec57efc..81ff07b8963 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git 
a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 60b35340eab..c70bb80c35c 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index c1506d8139b..2bfb9937c8c 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(performance_tests) endif() diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index dbdb52eb536..c65ad4641cf 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,7 +6,7 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c61e8212b02..48d42f803a8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -242,7 +242,7 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS}) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command(OUTPUT ${eager_impl_file} COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} @@ -276,7 +276,7 @@ if(WITH_PYTHON) COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command(OUTPUT ${eager_impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
"${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" @@ -288,7 +288,7 @@ if(WITH_PYTHON) endif() endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() @@ -296,7 +296,7 @@ if(WITH_PYTHON) cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) list(APPEND PYBIND_DEPS op_function_common) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3d8815e2eb6..2d9272dd0ed 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -79,12 +79,10 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/lod_utils.h" -#ifndef PADDLE_ON_INFERENCE #include "paddle/fluid/pybind/eager.h" -#endif #include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" @@ -529,9 +527,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif -#ifndef PADDLE_ON_INFERENCE BindEager(&m); -#endif BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8528ba34e21..9bef7e12851 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2374,7 +2374,7 @@ EOF fi startTime_s=`date +%s` set +e - cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto};build_error=$? + cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DWITH_PYTHON=${WITH_PYTHON:-ON};build_error=$? 
# reset ccache zero stats for collect PR's actual hit rate ccache -z @@ -2739,7 +2739,9 @@ function main() { test_fluid_lib ;; build_inference_lib) - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi cmake_gen ${PYTHON_ABI:-""} gen_fluid_lib ${parallel_number} ;; @@ -2790,7 +2792,9 @@ function main() { ;; test_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi gen_fluid_lib ${parallel_number} test_fluid_lib #test_fluid_lib_train @@ -2800,7 +2804,9 @@ function main() { ;; build_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi gen_fluid_lib ${parallel_number} ;; gpu_inference) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 7d64cf7bd89..2f6df075478 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -116,7 +116,7 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() -if(ON_INFER) +if(NOT WITH_PYTHON AND ON_INFER) LIST(REMOVE_ITEM TEST_OPS test_eager_trace_op) endif() -- GitLab From fb63508931868bd00d55af2abc34dfbd5c59915d Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 2 Mar 2022 11:15:10 +0800 Subject: [PATCH 039/272] optimize CUDA implementaion of randint OP (#39952) * change CUDA implementaion of randint OP,move distribution common func to phi * fix CI * fix CI --- .../phi/kernels/funcs/distribution_helper.h | 94 +++++++++++++++---- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 4 +- paddle/phi/kernels/gpu/randint_kernel.cu | 56 ++++++----- .../phi/kernels/gpu/uniform_random_kernel.cu | 6 +- .../tests/unittests/test_cuda_random_seed.py | 6 +- .../fluid/tests/unittests/test_randint_op.py | 45 +++++++++ 6 files changed, 162 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index 49e1c82482c..f0793fb9d27 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -21,12 +21,11 @@ limitations under the License. */ #include #endif +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/generator.h" - -#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/core/hostdevice.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/phi/kernels/primitive/kernel_primitives.h" @@ -40,7 +39,7 @@ limitations under the License. 
*/ #endif namespace phi { -namespace distribution { +namespace funcs { /********************* Transformation Function **********************/ template @@ -64,8 +63,9 @@ struct exponential_transform { }; template -struct uniform_transform { - explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} +struct uniform_real_transform { + explicit uniform_real_transform(T min, T max) + : range_(max - min), min_(min) {} HOSTDEVICE inline T operator()(T val) const { if (UNLIKELY(val == static_cast(1.0))) { @@ -80,6 +80,22 @@ struct uniform_transform { T min_; }; +template +struct uniform_int_transform { + explicit uniform_int_transform(int min, int max) { + range_ = static_cast(max - min); + min_ = min; + } + + HOSTDEVICE inline T operator()(R rand) const { + return static_cast(static_cast(rand % range_) + min_); + } + + private: + uint32_t range_; + int min_; +}; + template struct normal_transform { explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} @@ -120,6 +136,27 @@ struct uniform_distribution { static constexpr int kReturnsCount = 2; }; +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + curandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = curand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + template <> struct normal_distribution { __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { @@ -156,6 +193,27 @@ struct uniform_distribution { static constexpr int kReturnsCount = 2; }; +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(hiprandStatePhilox4_32_10_t *state) const { + return hiprand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = hiprand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + template <> struct normal_distribution { __device__ inline float4 operator()( @@ -209,19 +267,21 @@ __global__ void DistributionKernel(size_t size, } template -void distribution_and_transform(const GPUContext &dev_ctx, +void distribution_and_transform(const GPUContext &ctx, DenseTensor *out, DistOp dist, TransformOp trans) { - T *out_data = dev_ctx.template Alloc(out); + T *out_data = ctx.template Alloc(out); auto size = out->numel(); - - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = dev_ctx.GetGenerator(); + if (size == 0) return; + auto gen_cuda = ctx.GetGenerator(); size_t block_size = 256; size_t expect_grid_size = (size + block_size - 1) / block_size; - const auto &prop = backends::gpu::GetDeviceProperties(device_id); + + int64_t device_id = ctx.GetPlace().GetDeviceId(); + const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id); + size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * prop.multiProcessorCount; size_t grid_size = @@ -237,13 +297,13 @@ void distribution_and_transform(const GPUContext &dev_ctx, uint64_t seed = seed_offset.first; uint64_t offset = 
seed_offset.second; - DistributionKernel< - T, - DistOp, - TransformOp><<>>( + DistributionKernel<<>>( size, seed, offset, dist, trans, out_data, total_thread); } #endif -} // namespace distribution + +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index ac69d398b8a..2b6140d2fde 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -29,9 +29,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bernoulli_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/platform/transform.h" DECLARE_bool(use_curand); @@ -77,7 +77,7 @@ __global__ void bernoulli_cuda_kernel( size_t total_thread = gridDim.x * blockDim.x; for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) { - paddle::distribution::uniform_distribution dist; + funcs::uniform_distribution dist; float4 rand = dist(&state); #pragma unroll for (size_t j = 0; j < 4; j++) { diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index 66dc5f72a5c..d4cbd5c73fe 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -18,10 +18,13 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { template @@ -32,34 +35,39 @@ void RandintRawKernel(const Context& dev_ctx, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim(shape.GetData())); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - out->Resize(tmp.dims()); + out->Resize(phi::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + funcs::uniform_distribution dist; + funcs::uniform_int_transform trans(low, high); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim(shape.GetData())); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - std::uniform_int_distribution dist(low, high - 1); - auto numel = out->numel(); - for (int64_t i = 0; i < numel; ++i) { - tmp_data[i] = dist(*engine); - } + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } + + std::uniform_int_distribution dist(low, high - 1); + auto numel = out->numel(); + for (int64_t i = 0; i < numel; ++i) { + tmp_data[i] = dist(*engine); + } - paddle::memory::Copy( - out->place(), - data, - tmp.place(), - tmp_data, - numel * paddle::experimental::SizeOf(out->dtype()), - 0); + paddle::memory::Copy( + out->place(), + data, + tmp.place(), + tmp_data, + numel * paddle::experimental::SizeOf(out->dtype()), + 0); + } } template diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index 7f24a6667e5..cdab9faf6aa 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ 
b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -116,9 +116,9 @@ void UniformRandomRawKernel(const Context& dev_ctx, if (generator->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { using MT = typename kps::details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_ctx, out, dist, trans); + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(min, max); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { auto seed_offset = generator->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 686e738b8e0..69760192102 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -93,11 +93,11 @@ class TestGeneratorSeed(unittest.TestCase): fluid.enable_dygraph() - gen = paddle.seed(12312321111) + paddle.seed(12312321111) x = paddle.randint(low=10, shape=[10], dtype="int32") - st1 = gen.get_state() + st1 = paddle.get_cuda_rng_state() x1 = paddle.randint(low=10, shape=[10], dtype="int32") - gen.set_state(st1) + paddle.set_cuda_rng_state(st1) x2 = paddle.randint(low=10, shape=[10], dtype="int32") paddle.seed(12312321111) x3 = paddle.randint(low=10, shape=[10], dtype="int32") diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 82bfb88d54d..5f58054d7ef 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -20,6 +20,9 @@ from op_test import OpTest import paddle from paddle.fluid import core from paddle.static import program_guard, Program +import os + +paddle.enable_static() def output_hist(out): @@ -156,5 +159,47 @@ class TestRandintImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Different GPU generatte different random value. Only test V100 here. 
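        # Context for the fixed-value checks below: with FLAGS_use_curand the
        # GPU randint path samples from the counter-based Philox4_32-10
        # generator via uniform_int_transform above, so a fixed seed gives a
        # reproducible sequence on a given device family; int64 draws combine
        # two 32-bit words as (uint64_t)hi << 32 | lo. A rough NumPy sketch of
        # the value mapping only (illustrative, not the kernel itself):
        #   words = np.array([123456789, 987654321], dtype=np.uint32)
        #   vals  = low + (words % np.uint32(high - low))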
+ if not "V100" in paddle.device.cuda.get_device_name(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(100) + + x = paddle.randint( + -10000, 10000, [32, 3, 1024, 1024], dtype='int32').numpy() + self.assertTrue(x.mean(), -0.7517569760481516) + self.assertTrue(x.std(), 5773.696619107639) + expect = [2535, 2109, 5916, -5011, -261] + self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + expect = [3465, 7206, -8660, -9628, -6574] + self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + expect = [881, 1560, 1100, 9664, 1669] + self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + + x = paddle.randint( + -10000, 10000, [32, 3, 1024, 1024], dtype='int64').numpy() + self.assertTrue(x.mean(), -1.461287518342336) + self.assertTrue(x.std(), 5773.023477548159) + expect = [7213, -9597, 754, 8129, -1158] + self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + expect = [-7159, 8054, 7675, 6980, 8506] + self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + expect = [3581, 3420, -8027, -5237, -2436] + self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() -- GitLab From aa47297a5cf94fcd56b8647332ee92f971565d86 Mon Sep 17 00:00:00 2001 From: lkylkylky <48178838+daidaiershidi@users.noreply.github.com> Date: Wed, 2 Mar 2022 11:25:18 +0800 Subject: [PATCH 040/272] fix unittests for eignvalsh (#39841) --- .../fluid/tests/unittests/test_eigvalsh_op.py | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py index db023722676..93745d9561f 100644 --- a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py @@ -60,8 +60,12 @@ class TestEigvalshGPUCase(unittest.TestCase): self.dtype = "float32" np.random.seed(123) self.x_np = np.random.random(self.x_shape).astype(self.dtype) - self.rtol = 1e-5 - self.atol = 1e-5 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 def test_check_output_gpu(self): if paddle.is_compiled_with_cuda(): @@ -75,23 +79,29 @@ class TestEigvalshGPUCase(unittest.TestCase): class TestEigvalshAPI(unittest.TestCase): def setUp(self): - self.init_input_shape() + self.x_shape = [5, 5] self.dtype = "float32" self.UPLO = 'L' - self.rtol = 1e-6 - self.atol = 1e-6 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() np.random.seed(123) + self.init_input_data() + + def init_input_data(self): self.real_data = np.random.random(self.x_shape).astype(self.dtype) - self.complex_data = np.random.random(self.x_shape).astype( + complex_data = np.random.random(self.x_shape).astype( self.dtype) + 1J * np.random.random(self.x_shape).astype(self.dtype) self.trans_dims = list(range(len(self.x_shape) - 2)) + [ len(self.x_shape) - 1, len(self.x_shape) - 2 ] - - def init_input_shape(self): - self.x_shape = [5, 5] + self.complex_symm = np.divide( + complex_data + np.conj(complex_data.transpose(self.trans_dims)), 2) def compare_result(self, actual_w, expected_w): 
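    # The init_input_data change above feeds eigvalsh a genuinely Hermitian
    # matrix: for any complex A, (A + conj(A).T) / 2 equals its own conjugate
    # transpose, which is the precondition eigvalsh assumes. A rough NumPy
    # sketch of the same construction (illustrative only):
    #   a = np.random.rand(5, 5) + 1j * np.random.rand(5, 5)
    #   herm = (a + a.conj().T) / 2
    #   assert np.allclose(herm, herm.conj().T)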
np.testing.assert_allclose( @@ -122,9 +132,9 @@ class TestEigvalshAPI(unittest.TestCase): output_w = paddle.linalg.eigvalsh(input_x) exe = paddle.static.Executor(self.place) expected_w = exe.run(main_prog, - feed={"input_x": self.complex_data}, + feed={"input_x": self.complex_symm}, fetch_list=[output_w]) - actual_w = np.linalg.eigvalsh(self.complex_data) + actual_w = np.linalg.eigvalsh(self.complex_symm) self.compare_result(actual_w, expected_w[0]) def test_in_static_mode(self): @@ -139,14 +149,14 @@ class TestEigvalshAPI(unittest.TestCase): actual_w = paddle.linalg.eigvalsh(input_real_data) self.compare_result(actual_w, expected_w) - input_complex_data = paddle.to_tensor(self.complex_data) - expected_w = np.linalg.eigvalsh(self.complex_data) - actual_w = paddle.linalg.eigvalsh(input_complex_data) + input_complex_symm = paddle.to_tensor(self.complex_symm) + expected_w = np.linalg.eigvalsh(self.complex_symm) + actual_w = paddle.linalg.eigvalsh(input_complex_symm) self.compare_result(actual_w, expected_w) def test_eigvalsh_grad(self): paddle.disable_static(self.place) - x = paddle.to_tensor(self.complex_data, stop_gradient=False) + x = paddle.to_tensor(self.complex_symm, stop_gradient=False) w = paddle.linalg.eigvalsh(x) (w.sum()).backward() np.testing.assert_allclose( -- GitLab From 4e00d2bb338082dc9e3f1ee44b5887c930c8bb60 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:15 +0800 Subject: [PATCH 041/272] add_new_comm_primitive (#40040) --- .../distributed/collective/ProcessGroup.h | 20 ++- .../collective/ProcessGroupNCCL.cc | 156 ++++++++++++++++++ .../distributed/collective/ProcessGroupNCCL.h | 17 ++ paddle/fluid/distributed/collective/Types.h | 4 + paddle/fluid/pybind/distributed_py.cc | 33 ++++ .../tests/unittests/process_group_nccl.py | 30 ++++ 6 files changed, 259 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index dde8622d900..e4f27205202 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -96,7 +96,25 @@ class ProcessGroup { std::vector& /* tensors */, const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support allreduce", GetBackendName())); + "ProcessGroup%s does not support broadcast", GetBackendName())); + } + + virtual std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support barrier", GetBackendName())); + } + + virtual std::shared_ptr Send( + std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv( + std::vector& tensors /* tensors */, int src_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); } protected: diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index fe2325423b4..5d96e730aa4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include 
"paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); @@ -139,6 +142,14 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } } + + if (!barrierTensors_.empty()) { + // If we use the work to do barrier, we should block cpu + for (auto& place : places_) { + platform::CUDADeviceGuard gpuGuard(place); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } + } return true; } @@ -193,6 +204,10 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( nccl_ids.resize(1); auto& nccl_id = nccl_ids.front(); + for (auto& place : places) { + used_place_ids_.insert(place.GetDeviceId()); + } + if (rank_ == 0) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } @@ -274,6 +289,54 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + std::shared_ptr ProcessGroupNCCL::AllReduce( std::vector& tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( @@ -317,5 +380,98 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Barrier( + const BarrierOptions& opts) { + std::vector places; + + if (!opts.place_ids.empty()) { + for (auto place_id : opts.place_ids) { + places.emplace_back(place_id); + } + } else if (!used_place_ids_.empty()) { + for (auto place_id : used_place_ids_) { + places.emplace_back(place_id); + } + } else { + auto numGPUs = GetSize(); + int place_id = static_cast(rank_ % numGPUs); + places.emplace_back(place_id); + } + + std::vector barrierTensors; + barrierTensors.reserve(places.size()); + + platform::CUDADeviceGuard gpuGuard; + for (auto& place : places) { + gpuGuard.SetDeviceIndex(place.GetDeviceId()); + auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU); + barrierTensors.push_back(dt); + } + auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto nccl_task = dynamic_cast(task.get()); + nccl_task->barrierTensors_ = std::move(barrierTensors); + return task; +} + +void CheckTensorsInDifferentDevices(const 
std::vector& tensors, + const size_t num_devices) { + PADDLE_ENFORCE_EQ( + tensors.size() == 0, false, + platform::errors::InvalidArgument("Tensor list must be nonempty.")); + PADDLE_ENFORCE_LE( + tensors.size(), num_devices, + platform::errors::InvalidArgument( + "Tensor list mustn't be larger than the number of available GPUs.")); + + std::set used_devices; + + for (const auto& t : tensors) { + PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true, + platform::errors::InvalidArgument( + "Tensors must be CUDA and dense tensor.")); + + const auto inserted = used_devices.insert(t.inner_place()).second; + PADDLE_ENFORCE_EQ(inserted, true, + platform::errors::InvalidArgument( + "Tensors must be on distinct GPU devices.")); + } +} + +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, int dst_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + return platform::dynload::ncclSend( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, int src_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclRecv( + output_tensor->data(), output_tensor->numel(), + platform::ToNCCLDataType(output.type()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 9f06566d1c8..cfeb6467f0d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -65,6 +65,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~NCCLTask(); std::vector control_events_; + std::vector barrierTensors_; protected: std::vector places_; @@ -88,6 +89,15 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, @@ -106,6 +116,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>> places_to_ctx_; + std::set used_place_ids_; + private: void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); @@ -118,6 +130,11 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& outputs, // NOLINT Fn fn, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 654d0668695..699222ac452 100644 --- a/paddle/fluid/distributed/collective/Types.h 
+++ b/paddle/fluid/distributed/collective/Types.h @@ -32,5 +32,9 @@ struct BroadcastOptions { int source_root = 0; }; +struct BarrierOptions { + std::vector place_ids; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 7b59188a9f3..a4a1d07db2c 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -60,6 +60,10 @@ void BindDistributed(py::module *m) { .def_readwrite("source_root", &distributed::BroadcastOptions::source_root); + py::class_(*m, "BarrierOptions") + .def(py::init<>()) + .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); + auto ProcessGroup = py::class_>(*m, "ProcessGroup") @@ -88,6 +92,35 @@ void BindDistributed(py::module *m) { return self.Broadcast(tensors, opts); }, py::arg("tensor"), py::arg("source_rank"), + py::call_guard()) + + .def("barrier", + [](distributed::ProcessGroup &self, std::vector place_ids) { + distributed::BarrierOptions opts; + opts.place_ids = place_ids; + return self.Barrier(opts); + }, + py::arg("place_ids") = std::vector{}, + py::call_guard()) + + .def("send", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + std::vector tensors = {tensor}; + return self.Send(tensors, dst); + }, + py::arg("tensor"), py::arg("dst"), + py::call_guard()) + + .def("recv", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + std::vector tensors = {tensor}; + return self.Recv(tensors, src); + }, + py::arg("tensor"), py::arg("src"), py::call_guard()); #if defined(PADDLE_WITH_NCCL) diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index d999aad63ec..8ec5d13c569 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -132,6 +132,36 @@ class TestProcessGroupFp32(unittest.TestCase): print("test broadcast api ok") + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + + # test send/recv + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + if pg.rank() == 0: + task = pg.send(tensor_x, dst=1) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + task = pg.recv(tensor_y, src=0) + task.wait() + paddle.device.cuda.synchronize() + assert np.array_equal(tensor_x, tensor_y) + print("test send/recv api ok\n") + class TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): -- GitLab From 4cab812e04c4af2a67752e1da3de1d8acf7dba5c Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:12:57 +0800 Subject: [PATCH 042/272] [MLU] add transpose2 mlu kernel (#39994) --- paddle/fluid/operators/mlu/mlu_baseop.h | 13 +- .../operators/reduce_ops/reduce_max_op_mlu.cc | 4 +- .../operators/reduce_ops/reduce_min_op_mlu.cc | 4 +- .../softmax_with_cross_entropy_op_mlu.cc | 6 +- paddle/fluid/operators/transpose_op_mlu.cc | 74 ++++ .../unittests/mlu/test_transpose_op_mlu.py | 393 ++++++++++++++++++ 6 files changed, 482 insertions(+), 12 deletions(-) create mode 100644 paddle/fluid/operators/transpose_op_mlu.cc 
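# The ProcessGroupNCCL changes above build barrier() on top of the existing
# allreduce: a one-element dummy tensor is reduced on every device the group
# has used, and Wait() additionally calls cudaDeviceSynchronize so the host
# blocks until all ranks have passed the collective. A rough sketch of the
# resulting Python-side pattern (mirroring the unit test above):
#   task = pg.barrier();             task.wait()   # all ranks sync here
#   task = pg.send(tensor_x, dst=1); task.wait()   # rank 0 -> rank 1
#   task = pg.recv(tensor_y, src=0); task.wait()   # rank 1 <- rank 0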
create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2cbecba9fa0..2a54a8392c7 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1157,19 +1157,22 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const Tensor* transformed_input, Tensor* transformed_output, bool need_reshape_or_alloc) { - auto in_dims_vec = phi::vectorize(transformed_input->dims()); + const int dim_size = perm.size(); if (need_reshape_or_alloc) { + std::vector output_shape; + auto input_dims = transformed_input->dims(); + for (int i = 0; i < dim_size; ++i) { + output_shape.push_back(input_dims[perm[i]]); + } transformed_output->mutable_data( - {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], - in_dims_vec[perm[3]]}, - ctx.GetPlace()); + framework::DDim(output_shape.data(), dim_size), ctx.GetPlace()); } MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); - MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(), GetBasePtr(transformed_input), trans_out_desc.get(), GetBasePtr(transformed_output)); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 7e02f0268b5..1abec24c0d3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMaxMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc index daf5965fd54..d80cce74221 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMinMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 1cd6f8b7698..34650c2e062 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -37,7 +37,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { "the mlu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = 
CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); loss->mutable_data(ctx.GetPlace()); backprop->mutable_data(ctx.GetPlace()); @@ -45,10 +45,10 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] const int cnnl_softmax_dims = 3; - const int d1 = SizeToAxis(axis, logits->dims()); + const int d1 = phi::funcs::SizeToAxis(axis, logits->dims()); const int d2_logits = logits->dims()[axis]; const int d2_labels = labels->dims()[axis]; - const int d3 = SizeOutAxis(axis, logits->dims()); + const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims()); // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as // possible. diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc new file mode 100644 index 00000000000..40cb22bab50 --- /dev/null +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class TransposeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + std::vector axis = ctx.Attr>("axis"); + out->mutable_data(ctx.device_context().GetPlace()); + + TransposeFromMLUTensor(ctx, axis, x, out, + false /*need_reshape_or_alloc*/); + } +}; + +template +class TransposeGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + x_grad->mutable_data(ctx.GetPlace()); + + TransposeFromMLUTensor(ctx, reversed_axis, out_grad, x_grad, + false /*need_reshape_or_alloc*/); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(transpose2, ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel); + +REGISTER_OP_MLU_KERNEL(transpose2_grad, ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py new file mode 100644 index 00000000000..6f1bda477f0 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py @@ -0,0 +1,393 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle.fluid.core as core + +paddle.enable_static() + + +class TestTransposeOp(OpTest): + def setUp(self): + self.init_op_type() + self.initKernelType() + self.initTestCase() + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + self.attrs = {'axis': list(self.axis), } + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + def init_op_type(self): + self.op_type = "transpose2" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + + def initKernelType(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + +class TestCase0(TestTransposeOp): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestTransposeOp): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) + + +class TestCase6(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + +class TestTransposeOpBool(TestTransposeOp): + def test_check_grad(self): + pass + + +class TestTransposeOpBool1D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool2D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': 
self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool3D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool4D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool5D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool6D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool7D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool8D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[10, 5, 3], dtype='float32') + + def test_x_Variable_check(): + # the Input(x)'s type must be Variable + fluid.layers.transpose("not_variable", perm=[1, 0, 2]) + + self.assertRaises(TypeError, test_x_Variable_check) + + def test_perm_list_check(): + # Input(perm)'s type must be list + fluid.layers.transpose(x, perm="[1, 0, 2]") + + self.assertRaises(TypeError, test_perm_list_check) + + def test_perm_length_and_x_dim_check(): + # Input(perm) is the permutation of dimensions of Input(input) + # its length should be equal to dimensions of Input(input) + fluid.layers.transpose(x, perm=[1, 0, 2, 3, 4]) + + self.assertRaises(ValueError, test_perm_length_and_x_dim_check) + + def test_each_elem_value_check(): + # Each element in Input(perm) should be less than Input(x)'s dimension + fluid.layers.transpose(x, perm=[3, 5, 7]) + + self.assertRaises(ValueError, test_each_elem_value_check) + + +class TestTransposeApi(unittest.TestCase): + def test_static_out(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[2, 3, 4], dtype='float32') + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + place = paddle.MLUPlace(0) + exe = paddle.static.Executor(place) + x_np = np.random.random([2, 3, 4]).astype("float32") + result1, result2 = exe.run(feed={"x": x_np}, + fetch_list=[x_trans1, x_trans2]) + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(result1, expected_result1) + np.testing.assert_array_equal(result2, expected_result2) + + def 
test_dygraph_out(self): + # This is an old test before 2.0 API so we need to disable static + # to trigger dygraph + paddle.disable_static() + x = paddle.randn([2, 3, 4]) + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + x_np = x.numpy() + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(x_trans1.numpy(), expected_result1) + np.testing.assert_array_equal(x_trans2.numpy(), expected_result2) + # This is an old test before 2.0 API so we enable static again after + # dygraph test + paddle.enable_static() + + +class TestTAPI(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([10]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10, 5], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([10, 5]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[1, 5], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([1, 5]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10, 5]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([1, 5]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + def test_errors(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name='x', shape=[10, 5, 3], dtype='float32') + + def test_x_dimension_check(): + paddle.t(x) + + self.assertRaises(ValueError, test_x_dimension_check) + + +class TestMoveAxis(unittest.TestCase): + def test_moveaxis1(self): + x_np = np.random.randn(2, 3, 4, 5, 7).astype('float32') + expected = np.moveaxis(x_np, [0, 4, 3, 2], [1, 3, 2, 0]) + paddle.enable_static() + with paddle.static.program_guard(fluid.Program()): + x = paddle.static.data("x", shape=[2, 3, 4, 5, 7], dtype='float32') + out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) + + exe = paddle.static.Executor() + out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] + + self.assertEqual(np.array_equal(out_np, expected), True) + + paddle.disable_static() + x = 
paddle.to_tensor(x_np) + out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) + self.assertEqual(out.shape, [4, 2, 5, 7, 3]) + self.assertEqual(np.array_equal(out.numpy(), expected), True) + paddle.enable_static() + + def test_moveaxis2(self): + x_np = np.random.randn(2, 3, 5).astype('float32') + expected = np.moveaxis(x_np, -2, -1) + paddle.enable_static() + with paddle.static.program_guard(fluid.Program()): + x = paddle.static.data("x", shape=[2, 3, 5], dtype='float32') + out = x.moveaxis(-2, -1) + + exe = paddle.static.Executor() + out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] + + self.assertEqual(np.array_equal(out_np, expected), True) + + paddle.disable_static() + x = paddle.to_tensor(x_np) + out = x.moveaxis(-2, -1) + self.assertEqual(out.shape, [2, 5, 3]) + self.assertEqual(np.array_equal(out.numpy(), expected), True) + paddle.enable_static() + + def test_error(self): + x = paddle.randn([2, 3, 4, 5]) + # src must have the same number with dst + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [1, 0], [2]) + + # each element of src must be unique + with self.assertRaises(ValueError): + paddle.moveaxis(x, [1, 1], [0, 2]) + + # each element of dst must be unique + with self.assertRaises(ValueError): + paddle.moveaxis(x, [0, 1], [2, 2]) + + # each element of src must be integer + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [0.5], [1]) + + # each element of dst must be integer + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [0], [1.5]) + + # each element of src must be in the range of [-4, 3) + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [-10, 1], [2, 3]) + + # each element of dst must be in the range of [-4, 3) + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [2, 1], [10, 3]) + + +if __name__ == '__main__': + unittest.main() -- GitLab From 36660d4c356d4c6b71eb8df51e094ea36bfa2c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:02:42 +0800 Subject: [PATCH 043/272] [infrt] speed up the infrt ci. 
test=devvelop (#40032) --- paddle/scripts/infrt_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 8d858647ea6..a0132501387 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -102,9 +102,11 @@ function infrt_gen_and_build() { function create_fake_models() { cd ${PADDLE_ROOT}/build + cd python/dist/ # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle - python3 -m pip install paddlepaddle + python3 -m pip install *whl + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py } -- GitLab From 9070d5c5d85e15a04324b6a5f2f1e2c9a7ecc1b6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Wed, 2 Mar 2022 14:08:19 +0800 Subject: [PATCH 044/272] test=document_fix;record py3 case time (#40018) --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9bef7e12851..ed70a8638bf 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -776,7 +776,9 @@ set +x tmpfile=$tmp_dir/$tmpfile_rand ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile fi - + ut_total_endTime_s=`date +%s` + echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s" + collect_failed_tests rm -f $tmp_dir/* exec_times=0 -- GitLab From b4d931e8bce97a12e9ac7a12ff6c0a11499002c7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 2 Mar 2022 14:23:35 +0800 Subject: [PATCH 045/272] [MLU] adapt matmul op (#39727) * [MLU] adapt matmul op * [MLU] fix phi namespace --- paddle/fluid/imperative/CMakeLists.txt | 6 +- paddle/fluid/operators/matmul_op_mlu.cc | 337 ++++++++++++++++++ .../tests/unittests/mlu/test_matmul_op_mlu.py | 329 +++++++++++++++++ 3 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/matmul_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index f198919b0c8..e1ce705533a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -46,8 +46,12 @@ if(WITH_GLOO) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 00000000000..d0c84c4751e --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py new file mode 100644 index 00000000000..adfff112e6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
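        # Illustration, assuming plain NumPy semantics:
        #   np.matmul(np.ones(3), np.ones(3))  ->  array(3.)   (0-d result)
        #   np.array([3.0], dtype="float64")   ->  shape (1,)
        # so reference_matmul always returns at least a 1-D array.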
+ Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_mlu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 2, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 
5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.transpose_X = False + self.transpose_Y = False + + +# TODO(mlu): alpha will be supported in next version +#--------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +#--------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() -- GitLab From 0764fda25bb016bf143fc0a3aa93a3fb56b0cd73 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Mar 2022 15:07:34 +0800 Subject: [PATCH 046/272] [Phi] Unify complex type trait and fix real imag bug (#40036) * unify complex type trait and fix real imag bug * add unittest for type tratis --- paddle/fluid/operators/angle_op.h | 6 +- paddle/fluid/operators/eig_op.h | 26 ++-- paddle/fluid/operators/eigh_op.h | 2 +- paddle/fluid/operators/eigvals_op.h | 14 +- paddle/fluid/operators/imag_op.cc | 2 +- paddle/fluid/operators/lstsq_op.h | 4 +- .../operators/math/eigen_values_vectors.h | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/qr_op.cu | 14 +- paddle/fluid/operators/qr_op.h | 18 +-- paddle/fluid/operators/real_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 12 +- paddle/fluid/operators/svd_op.h | 12 +- paddle/phi/common/type_traits.h | 96 ++++++++++++++ paddle/phi/infermeta/unary.cc | 7 + paddle/phi/infermeta/unary.h 
| 2 + paddle/phi/kernels/cpu/abs_kernel.cc | 6 +- paddle/phi/kernels/cpu/complex_kernel.cc | 8 +- paddle/phi/kernels/funcs/complex_functors.h | 123 ++++++------------ paddle/phi/kernels/gpu/abs_kernel.cu | 10 +- paddle/phi/kernels/gpu/complex_kernel.cu | 8 +- .../phi/kernels/impl/abs_grad_kernel_impl.h | 2 +- .../kernels/impl/complex_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 8 +- paddle/phi/tests/common/test_data_type.cc | 16 +++ 25 files changed, 247 insertions(+), 165 deletions(-) create mode 100644 paddle/phi/common/type_traits.h diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea2961..116a8053db3 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705a..e9c6c1eb7ec 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -87,19 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +207,23 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, 
phi::funcs::Real, Tout>(context); + DeviceContext, phi::dtype::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -236,7 +236,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -272,7 +272,7 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index 294794877b3..5279ec75093 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b9..4627acc0d07 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - 
phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992..567a69f383d 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb..3cbbc62e7be 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b..1ade2190bb9 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int 
*lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a64611..9994ccc10cb 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fe..a57a8d5cf8b 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea0..f09a07e96cd 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); 
phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b5..28a8484f539 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f04..166f49999d5 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac705..42a847206a3 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? 
cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/phi/common/type_traits.h b/paddle/phi/common/type_traits.h new file mode 100644 index 00000000000..ef894eee468 --- /dev/null +++ b/paddle/phi/common/type_traits.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" + +namespace phi { +namespace dtype { + +template +struct cond { + static constexpr bool value = B; + using type = T; +}; + +template +struct eval_if { + using type = typename TrueF::type; +}; + +template +struct eval_if { + using type = typename FalseF::type; +}; + +template +using eval_if_t = typename eval_if::type; + +template +struct select { + using type = eval_if_t>; +}; + +template +struct select { + using type = T; +}; + +template +struct select> { + // last one had better be true! + static_assert(B, "No match select type!"); + using type = T; +}; + +template +using select_t = typename select::type; + +// runtime real and complex type conversion + +template +using Real = select_t>::value, float>, + cond>::value, double>, + T>; + +template +using Complex = select_t::value, complex>, + cond::value, complex>, + T>; + +inline DataType ToReal(DataType dtype) { + switch (dtype) { + case phi::DataType::COMPLEX64: + return phi::DataType::FLOAT32; + case phi::DataType::COMPLEX128: + return phi::DataType::FLOAT64; + default: + return dtype; + } +} + +inline DataType ToComplex(DataType dtype) { + switch (dtype) { + case phi::DataType::FLOAT32: + return phi::DataType::COMPLEX64; + case phi::DataType::FLOAT64: + return phi::DataType::COMPLEX128; + default: + return dtype; + } +} + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 983e0162264..fbd9259a83f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -51,6 +52,12 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, out->share_meta(x); } +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a2d779e0f70..3c0628981af 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -39,6 +39,8 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, int axis, MetaTensor* out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index efe7d090405..9f89fc27a71 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -25,9 +25,9 @@ template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - ctx.template Alloc>( - out, size_t(x.numel() * sizeof(phi::funcs::Real))); - auto* out_data = out->data>(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(phi::dtype::Real))); + auto* out_data = out->data>(); phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 801502e1673..859d5a84527 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -37,11 +37,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, CPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 86dbdd099ec..8b292cb5dc5 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -20,56 +20,12 @@ limitations under the License. */ #include #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/hostdevice.h" namespace phi { namespace funcs { -template -struct cond { - static constexpr bool value = B; - using type = T; -}; - -template -struct eval_if { - using type = typename TrueF::type; -}; - -template -struct eval_if { - using type = typename FalseF::type; -}; - -template -using eval_if_t = typename eval_if::type; - -template -struct select { - using type = eval_if_t>; -}; - -template -struct select { - using type = T; -}; - -template -struct select> { - // last one had better be true! 
- static_assert(B, "No match select type!"); - using type = T; -}; - -template -using select_t = typename select::type; - -template -using Real = - select_t>::value, float>, - cond>::value, double>, - T>; - template using Complex = typename std::enable_if::value>::type; @@ -91,9 +47,9 @@ template struct RealFunctor; template -struct RealFunctor>> { +struct RealFunctor>> { public: - RealFunctor(const T* input, Real* output, int64_t numel) + RealFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -102,7 +58,7 @@ struct RealFunctor>> { private: const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -110,8 +66,8 @@ template struct ImagFunctor; template -struct ImagFunctor>> { - ImagFunctor(const T* input, Real* output, int64_t numel) +struct ImagFunctor>> { + ImagFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -119,7 +75,7 @@ struct ImagFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -127,8 +83,8 @@ template struct AbsFunctor; template -struct AbsFunctor>> { - AbsFunctor(const T* input, Real* output, int64_t numel) +struct AbsFunctor>> { + AbsFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -136,12 +92,12 @@ struct AbsFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; template -struct AbsFunctor>> { +struct AbsFunctor>> { AbsFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -203,7 +159,10 @@ struct AbsGradCUDAFunctor> { template struct AbsGradFunctor { - AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) + AbsGradFunctor(const dtype::Real* dout, + const T* x, + T* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -214,7 +173,7 @@ struct AbsGradFunctor { } } - const Real* dout_; + const dtype::Real* dout_; const T* x_; T* output_; int64_t numel_; @@ -334,8 +293,8 @@ template struct RealToComplexFunctor; template -struct RealToComplexFunctor>> { - RealToComplexFunctor(const Real* input, T* output, int64_t numel) +struct RealToComplexFunctor>> { + RealToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -343,7 +302,7 @@ struct RealToComplexFunctor>> { output_[idx].imag = 0; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -352,8 +311,8 @@ template struct ImagToComplexFunctor; template -struct ImagToComplexFunctor>> { - ImagToComplexFunctor(const Real* input, T* output, int64_t numel) +struct ImagToComplexFunctor>> { + ImagToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -361,7 +320,7 @@ struct ImagToComplexFunctor>> { output_[idx].imag = input_[idx]; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -370,9 +329,9 @@ template struct RealImagToComplexFunctor; template -struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, - const Real* input_imag, +struct 
RealImagToComplexFunctor>> { + RealImagToComplexFunctor(const dtype::Real* input_real, + const dtype::Real* input_imag, T* output, int64_t numel) : input_real_(input_real), @@ -385,8 +344,8 @@ struct RealImagToComplexFunctor>> { output_[idx].imag = input_imag_[idx]; } - const Real* input_real_; - const Real* input_imag_; + const dtype::Real* input_real_; + const dtype::Real* input_imag_; T* output_; int64_t numel_; }; @@ -423,8 +382,8 @@ struct AngleFunctor; // angel function for complex template -struct AngleFunctor>> { - AngleFunctor(const T* input, phi::funcs::Real* output, int64_t numel) +struct AngleFunctor>> { + AngleFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -432,13 +391,13 @@ struct AngleFunctor>> { } const T* input_; - phi::funcs::Real* output_; + dtype::Real* output_; int64_t numel_; }; // angel function for real template -struct AngleFunctor>> { +struct AngleFunctor>> { AngleFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -456,25 +415,22 @@ struct AngleGradFunctor; // angle grad for complex template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { if (x_[idx] == T(0)) { dx_[idx] = T(0); } else { - const phi::funcs::Real r_square = + const phi::dtype::Real r_square = x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, dout_[idx] * x_[idx].real / r_square); } } - const phi::funcs::Real* dout_; + const phi::dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; @@ -482,16 +438,13 @@ struct AngleGradFunctor>> { // angle grad for real template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - const phi::funcs::Real* dout_; + const dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index e122e6b1e9c..5c424316a83 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -27,14 +27,14 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::funcs::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor>> { +struct CudaAbsFunctor>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -42,12 +42,12 @@ struct CudaAbsFunctor>> { template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc>(out); + ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu 
b/paddle/phi/kernels/gpu/complex_kernel.cu index d0b086718a4..e03e079581a 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -38,11 +38,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, GPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 78c25200bbd..9dad40b57c9 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -47,7 +47,7 @@ void AbsGradKernel(const Context& ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* x_data = x.data(); ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index a10481284b1..03896a2353d 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -24,7 +24,7 @@ void RealGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); @@ -38,7 +38,7 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index ff5cf86ed2e..72b13288339 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -39,8 +39,8 @@ void RealKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); @@ -53,8 +53,8 @@ void ImagKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/tests/common/test_data_type.cc b/paddle/phi/tests/common/test_data_type.cc index c962c68b4d5..5a1b41d796d 100644 --- a/paddle/phi/tests/common/test_data_type.cc +++ b/paddle/phi/tests/common/test_data_type.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" namespace phi { namespace tests { @@ -71,5 +72,20 @@ TEST(DataType, OStream) { } } +TEST(TypeTraits, Complex) { + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX64), + phi::DataType::FLOAT32); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX128), + phi::DataType::FLOAT64); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::FLOAT32), phi::DataType::FLOAT32); + + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT32), + phi::DataType::COMPLEX64); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT64), + phi::DataType::COMPLEX128); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::COMPLEX64), + phi::DataType::COMPLEX64); +} + } // namespace tests } // namespace phi -- GitLab From 90ab7403753acad5c93b425f6a909a526aa57a3d Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 2 Mar 2022 15:11:42 +0800 Subject: [PATCH 047/272] [KP] Activation op registration for XPU2. part 1/2 (#40002) --- .../{activation_op.cu => activation_op.kps} | 64 +++++++++++++++++++ .../platform/device/xpu/xpu_op_kpfirst_list.h | 26 ++++++++ 2 files changed, 90 insertions(+) rename paddle/fluid/operators/{activation_op.cu => activation_op.kps} (94%) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 94% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e7..e1afb3919f8 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -1861,3 +1861,67 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) + +#ifdef PADDLE_WITH_XPU_KP +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_KERNEL( \ + act_type, KP, plat::XPUPlace, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, + CudaReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); 
+REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f79ef8505d8..c5dff84723c 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,32 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + // activation op + {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"celu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"silu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softshrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"ceil", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"floor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log1p", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"brelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"soft_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softsign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; -- GitLab From 244ae318c2fbfea0ab4315a17f6e6296c6be2624 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 2 Mar 2022 15:24:36 +0800 Subject: [PATCH 048/272] [fleet_executor] Add entrance of 
FleetExecutor in AnalysisPredictor for distributed inference (#39992) --- .../distributed/fleet_executor/carrier.cc | 24 +- .../distributed/fleet_executor/carrier.h | 7 +- .../fleet_executor/fleet_executor.cc | 48 ++- .../fleet_executor/fleet_executor.h | 10 +- .../distributed/fleet_executor/task_node.cc | 11 +- .../distributed/fleet_executor/task_node.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 289 +++++++++++++++++- .../fluid/inference/api/analysis_predictor.h | 59 ++++ .../inference/api/paddle_analysis_config.h | 57 ++++ .../fluid/inference/tests/api/CMakeLists.txt | 6 + .../tests/api/analyzer_dist_model_tester.cc | 72 +++++ paddle/fluid/pybind/bind_fleet_executor.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 19 +- python/paddle/fluid/executor.py | 5 +- 15 files changed, 581 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4..0d5d328fd32 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 
9a74fa78c0e..d35a3260915 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4..e946d78550f 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
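The hunk that follows prunes the inference output variables out of the garbage-collection bookkeeping returned by GetUnusedVars. As a minimal standalone sketch of just that container manipulation, with plain std::string keys standing in for framework::OperatorBase* and purely hypothetical op/var names (it assumes nothing about Paddle itself):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // op -> vars that would normally be freed right after the op runs
  std::map<std::string, std::vector<std::string>> unused_vars{
      {"matmul", {"tmp_0", "fetch_out"}}, {"scale", {"tmp_1"}}};
  // inference results that must stay alive for ZeroCopy fetching
  const std::vector<std::string> keep_alive{"fetch_out"};

  for (auto& pair : unused_vars) {
    auto& unused = pair.second;
    for (const auto& name : keep_alive) {
      auto it = std::find(unused.begin(), unused.end(), name);
      if (it != unused.end()) unused.erase(it);
    }
  }
  // drop ops whose unused-var list became empty
  for (auto it = unused_vars.begin(); it != unused_vars.end();) {
    if (it->second.empty()) {
      it = unused_vars.erase(it);
    } else {
      ++it;
    }
  }
  for (const auto& pair : unused_vars) {
    std::cout << pair.first << " still frees " << pair.second.size()
              << " var(s)\n";
  }
  return 0;
}

The same erase-then-compact pattern is what keeps the fetched results visible in the root scope after the carrier finishes a run.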
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127b..ccdb3dcc459 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 6de7038b323..95e4c733059 100644 --- 
a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37..4764d4fd4af 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b..9c33d700306 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -274,6 +274,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759..5492c3b0d26 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -186,14 +195,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
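PrepareFeedFetch is moved ahead of PrepareExecutor here because the fleet-executor path added below reads idx2feeds_/idx2fetches_ before building the task node. The feed/fetch filtering introduced in TaskNode::Init above reduces to a type check while copying op descs; a standalone sketch with a hypothetical block and no Paddle types:

#include <iostream>
#include <string>
#include <vector>

// Collect the op types a task node would instantiate, optionally dropping
// the feed/fetch ops that the inference path serves through the root scope.
std::vector<std::string> SelectOps(const std::vector<std::string>& block_ops,
                                   bool use_feed_fetch_ops) {
  std::vector<std::string> selected;
  for (const auto& type : block_ops) {
    if (!use_feed_fetch_ops && (type == "feed" || type == "fetch")) {
      continue;  // skipped, mirroring TaskNode::Init(false)
    }
    selected.push_back(type);
  }
  return selected;
}

int main() {
  const std::vector<std::string> block{"feed", "conv2d", "relu", "fetch"};
  for (const auto& t : SelectOps(block, /*use_feed_fetch_ops=*/false)) {
    std::cout << t << "\n";  // prints conv2d and relu only
  }
  return 0;
}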
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +368,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +387,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1182,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1232,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1281,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1305,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37..8ed183dae0b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); 
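The comm-init converter config consumed by LoadConverterConfig (implemented earlier in this patch and declared just below) is a plain text file with two comma-separated sections. A minimal standalone parser over an in-memory string is sketched here; only the two section headers come from the patch, the sample values are made up:

#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Same layout as the comment in LoadConverterConfig describes.
  std::istringstream fin(
      "[ring_id -> ranks]\n0,0,1,2,3\n1,0,1\n"
      "[rank -> ring_ids]\n0,0,1\n1,0,1\n");
  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks;
  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids;
  bool ring_to_rank = true;
  std::string line;
  while (std::getline(fin, line)) {
    if (line == "[ring_id -> ranks]") { ring_to_rank = true; continue; }
    if (line == "[rank -> ring_ids]") { ring_to_rank = false; continue; }
    std::stringstream ss(line);
    std::string field;
    std::vector<int64_t> fields;
    while (std::getline(ss, field, ',')) fields.push_back(std::stoll(field));
    if (fields.size() < 2) continue;
    auto* dst = ring_to_rank ? &ring_id_to_ranks : &rank_to_ring_ids;
    (*dst)[fields[0]].assign(fields.begin() + 1, fields.end());
  }
  std::cout << "parsed " << ring_id_to_ranks.size() << " rings and "
            << rank_to_ring_ids.size() << " ranks\n";
  return 0;
}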
void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi gpus inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -436,6 +487,14 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; int clone_num_{1}; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 180c028c6a6..b4a35839440 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -76,6 +76,54 @@ struct LiteNNAdapterConfig { LiteNNAdapterConfig& Disable(); }; +struct DistConfig { + bool use_dist_model() const { return use_dist_model_; } + void EnableDistModel(bool use_dist_model) { + use_dist_model_ = use_dist_model; + } + + std::vector trainer_endpoints() const { + return trainer_endpoints_; + } + + std::string current_endpoint() const { return current_endpoint_; } + + void SetEndpoints(const std::vector& trainer_endpoints, + const std::string& current_endpoint) { + trainer_endpoints_ = trainer_endpoints; + current_endpoint_ = current_endpoint; + } + + int64_t nranks() const { return nranks_; } + + int64_t rank() const { return rank_; } + + void SetRanks(int64_t nranks, int64_t rank) { + nranks_ = nranks; + rank_ = rank; + } + + std::string comm_init_config() const { return comm_init_config_; } + + void SetCommInitConfig(const std::string& comm_init_config) { + comm_init_config_ = comm_init_config; + } + + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total ranks (number of trainers) + 
int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -763,6 +811,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -902,6 +956,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 0281fd91765..8c96499a022 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -720,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 00000000000..7cf6e2adfc6 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." 
<< std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." << std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b29cc10e8f5..8491d1e2249 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -168,7 +168,7 @@ void BindFleetExecutor(py::module* m) { .def("set_run_at_offset", &TaskNode::SetRunAtOffset) .def("set_type", &TaskNode::SetType) .def("role", &TaskNode::role) - .def("init", &TaskNode::Init) + .def("init", [](TaskNode& self) { self.Init(); }) .def("set_program", &TaskNode::SetProgram); py::class_(*m, "DistModelConfig") diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index eafd5baab7d..9b5041154c9 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -658,7 +658,24 @@ void BindAnalysisConfig(py::module *m) { return dynamic_cast(self.pass_builder()); }, py::return_value_policy::reference) - .def("nnadapter", &AnalysisConfig::NNAdapter); + .def("nnadapter", &AnalysisConfig::NNAdapter) + .def("set_dist_config", &AnalysisConfig::SetDistConfig) + .def("dist_config", &AnalysisConfig::dist_config); + + py::class_(*m, "DistConfig") + .def(py::init<>()) + .def("set_carrier_id", &DistConfig::SetCarrierId) + .def("set_comm_init_config", &DistConfig::SetCommInitConfig) + .def("set_endpoints", &DistConfig::SetEndpoints) + .def("set_ranks", &DistConfig::SetRanks) + .def("enable_dist_model", &DistConfig::EnableDistModel) + .def("carrier_id", &DistConfig::carrier_id) + .def("current_endpoint", &DistConfig::current_endpoint) + .def("trainer_endpoints", &DistConfig::trainer_endpoints) + .def("nranks", &DistConfig::nranks) + .def("rank", &DistConfig::rank) + .def("comm_init_config", &DistConfig::comm_init_config) + .def("use_dist_model", &DistConfig::use_dist_model); } void BindLiteNNAdapterConfig(py::module *m) { diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e372727b0f0..a7971763f53 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2034,8 +2034,11 @@ 
class Executor(object): fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) + # NOTE: the last argument is used to force create some vars in root scope, + # won't be used during train. self._fleet_executor.init(carrier_id, program.desc, scope, place, - num_micro_batches, tasks, task_id_to_rank) + num_micro_batches, tasks, task_id_to_rank, + []) def _run_using_fleet_executor(self, program=None, -- GitLab From bc113e10487115fd91cfc738c4279372eeb7c2a2 Mon Sep 17 00:00:00 2001 From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:29:24 +0800 Subject: [PATCH 049/272] add logic kernel for mlu (#39940) --- .../operators/controlflow/compare_op_mlu.cc | 200 ++++++++++++++++++ .../unittests/mlu/test_compare_op_mlu.py | 157 ++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/controlflow/compare_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 00000000000..9dc287ab76a --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : 
public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py new file mode 100644 index 00000000000..87997acce02 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def create_test_class(op_type, typename, callback): + class Cls(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + out = callback(x, y) + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': out} + self.op_type = op_type + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_output(self): + self.check_output_with_place(place=self.place) + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + a = fluid.layers.data(name='a', shape=[2], dtype='float32') + b = fluid.layers.data(name='b', shape=[2], dtype='float32') + c = fluid.layers.data(name='c', shape=[2], dtype='int16') + d = fluid.create_lod_tensor(np.array([[-1]]), [[1]], self.place) + + op = eval("fluid.layers.%s" % self.op_type) + self.assertRaises(TypeError, op, x=a, y=b, axis=True) + self.assertRaises(TypeError, op, x=a, y=b, force_cpu=1) + self.assertRaises(TypeError, op, x=a, y=b, cond=1) + self.assertRaises(TypeError, op, x=a, y=c) + self.assertRaises(TypeError, op, x=c, y=a) + self.assertRaises(TypeError, op, x=a, y=d) + self.assertRaises(TypeError, op, x=d, y=a) + self.assertRaises(TypeError, op, x=c, y=d) + + def test_dynamic_api(self): + paddle.disable_static() + paddle.set_device('mlu:0') + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + real_result = callback(x, y) + x = paddle.to_tensor(x, dtype=typename) + y = paddle.to_tensor(y, dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_3(self): + paddle.enable_static() + with 
program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype=typename) + y = paddle.static.data(name='y', shape=[3, 1], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(typename) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_attr_name(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[4], dtype=typename) + y = fluid.layers.data(name='y', shape=[4], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x=x, y=y, name="name_%s" % (self.op_type)) + self.assertEqual("name_%s" % (self.op_type) in out.name, True) + + cls_name = "{0}_{1}".format(op_type, typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for _type_name in {'float16', 'float32', 'int32', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + continue + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) + +if __name__ == '__main__': + unittest.main() -- GitLab From 0c3f7fbcfe68bfb34b0ed5d9aad6e3a8c0cca43f Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 2 Mar 2022 15:30:09 +0800 Subject: [PATCH 050/272] Upgrade new profiler (#39984) * add new profiler components * fix bug * upgrade new profiler * fix operator.cc * fix operator.cc * fix cmakelists.txt * fix bug * fix according to pr * fix bug * fix cmake * fix bug * fix a bug * fix bug * fix bug --- paddle/fluid/framework/operator.cc | 8 +- paddle/fluid/platform/profiler/CMakeLists.txt | 10 +- .../platform/profiler/chrometracing_logger.cc | 320 ++++++++++++++---- .../platform/profiler/chrometracing_logger.h | 11 + .../platform/profiler/cpu_utilization.cc | 47 ++- .../platform/profiler/dump/CMakeLists.txt | 3 - .../profiler/dump/deserialization_reader.cc | 16 +- .../profiler/dump/deserialization_reader.h | 4 +- .../platform/profiler/dump/nodetree.proto | 27 +- .../profiler/dump/serialization_logger.cc | 12 + .../profiler/dump/serialization_logger.h | 5 + .../dump/test_serialization_logger.cc | 28 +- .../fluid/platform/profiler/event_python.cc | 122 +++++++ paddle/fluid/platform/profiler/event_python.h | 26 +- paddle/fluid/platform/profiler/profiler.cc | 35 +- paddle/fluid/platform/profiler/profiler.h | 10 +- .../fluid/platform/profiler/profiler_test.cc | 11 +- paddle/fluid/platform/profiler/trace_event.h | 2 + 18 files changed, 578 insertions(+), 119 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/dump/serialization_logger.h create mode 100644 paddle/fluid/platform/profiler/event_python.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b12ad552aba..b91ee3c2d63 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -264,10 +264,10 @@ 
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 5acdfa39569..c903a52530c 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -2,10 +2,12 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) add_subdirectory(dump) +cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) +cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 7b207ea7b20..4061e2d4d49 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -18,40 +18,17 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { static const char* kSchemaVersion = "1.0.0"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; -static uint32_t num_span = 0; - -static int64_t nsToUs(int64_t ns) { return ns / 1000; } - -template -std::string string_format(const std::string& format, Args... args) { - int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ - 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); - auto size = static_cast(size_s); - auto buf = std::make_unique(size); - std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), size - 1); // exclude the '\0' -} - -std::string GetStringFormatLocalTime() { - std::time_t rawtime; - std::tm* timeinfo; - char buf[100]; - std::time(&rawtime); - timeinfo = std::localtime(&rawtime); - std::strftime(buf, 100, "%F-%X", timeinfo); - return std::string(buf); -} +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -60,16 +37,19 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "operator", "dataloader", "profile_step", "cuda_runtime", "kernel", - "memcpy", "memset", "user_defined", "others"}; + "Operator", "Dataloader", "ProfileStep", "CudaRuntime", + "Kernel", "Memcpy", "Memset", "UserDefined", + "OperatorInner", "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, std::ofstream::out | std::ofstream::trunc); if (!output_file_stream_) { - VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; } else { - VLOG(0) << "writing profiling data to " << filename_ << std::endl; + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; } } @@ -122,21 +102,54 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } - output_file_stream_ << string_format( - std::string( - R"JSON( + switch (host_node.Type()) { + case TracerEventType::ProfileStep: + case TracerEventType::Forward: + case TracerEventType::Backward: + case TracerEventType::Dataloader: + case TracerEventType::Optimization: + case TracerEventType::PythonOp: + case TracerEventType::PythonUserDefined: + output_file_stream_ << string_format( + std::string( + R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { - + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), - categary_name_[static_cast(host_node.Type())]); + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + default: + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "%s", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %lld, + "ph": "X", "cat": "%s", + "args": { + "start_ns": %lld, + "end_ns": %lld + } + }, + )JSON"), + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + } + + pid_tid_set_.insert({host_node.ProcessId(), host_node.ThreadId()}); } void ChromeTracingLogger::LogRuntimeTraceEventNode( @@ -148,11 +161,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, 
"dur": %lld, "ph": "X", "cat": "%s", "args": { - "correlation id": %d + "correlation id": %d, + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), @@ -160,7 +175,23 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUs(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId()); + runtime_node.CorrelationId(), runtime_node.StartNs(), + runtime_node.EndNs()); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); + + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, + "ph": "s", "cat": "async" + }, + )JSON"), + runtime_node.CorrelationId(), runtime_node.ProcessId(), + runtime_node.ThreadId(), + nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1)); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); } void ChromeTracingLogger::LogDeviceTraceEventNode( @@ -180,6 +211,36 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( default: break; } + if (nsToUs(device_node.Duration()) == 0) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs())); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } else { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async", "bp": "e" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), + nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1)); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } } void ChromeTracingLogger::HandleTypeKernel( @@ -188,16 +249,21 @@ void ChromeTracingLogger::HandleTypeKernel( float blocks_per_sm = 0.0; float warps_per_sm = 0.0; float occupancy = 0.0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUPTI) constexpr int threads_per_warp = 32; const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); - blocks_per_sm = - (kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / - device_property.multiProcessorCount; + blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * + kernel_info.grid_z) / + device_property.multiProcessorCount; warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; + occupancy = CalculateEstOccupancy( + device_node.DeviceId(), kernel_info.registers_per_thread, + kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, + kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, + blocks_per_sm); #endif output_file_stream_ << string_format( @@ -208,15 +274,17 @@ void ChromeTracingLogger::HandleTypeKernel( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, - "shared memory": %f, + "shared memory": %d, "blocks per SM": %f, "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "est. 
achieved occupancy %": %f + "theoretical achieved occupancy %%": %f } }, )JSON"), @@ -224,12 +292,13 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, - kernel_info.block_z, occupancy); + kernel_info.block_z, occupancy * 100); } void ChromeTracingLogger::HandleTypeMemcpy( @@ -247,6 +316,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "stream": %d, "correlation id": %d, "bytes": %d, "memory bandwidth (GB/s)": %f } @@ -256,8 +327,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StreamId(), device_node.CorrelationId(), - memcpy_info.num_bytes, memory_bandwidth); + device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } void ChromeTracingLogger::HandleTypeMemset( @@ -271,6 +342,8 @@ void ChromeTracingLogger::HandleTypeMemset( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d @@ -281,7 +354,8 @@ void ChromeTracingLogger::HandleTypeMemset( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } @@ -290,10 +364,10 @@ void ChromeTracingLogger::StartLog() { R"JSON( { "schemaVersion": "%s", - "displayTimeUnit": "us", - "SpanNumber": "%d", + "displayTimeUnit": "ms", + "span_indx": "%d", )JSON"), - kSchemaVersion, num_span); + kSchemaVersion, span_indx++); // add device property information #if defined(PADDLE_WITH_CUDA) output_file_stream_ << std::string(R"JSON( @@ -358,11 +432,143 @@ void ChromeTracingLogger::StartLog() { )JSON"); } -void ChromeTracingLogger::EndLog() { +void ChromeTracingLogger::LogMetaInfo( + const std::unordered_map extra_info) { + RefineDisplayName(extra_info); output_file_stream_ << std::string( R"JSON( {} - ] + ], + )JSON"); + output_file_stream_ << std::string(R"JSON( + "ExtraInfo": {)JSON"); + size_t count = extra_info.size(); + for (const auto& kv : extra_info) { + if (count > 1) { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s", + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } else { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s" + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } + count--; + } + output_file_stream_ << std::string(R"JSON( + })JSON"); +} + +void ChromeTracingLogger::RefineDisplayName( + std::unordered_map 
extra_info) { + for (auto it = pid_tid_set_.begin(); it != pid_tid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "process_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "thread %lld:%s(Python)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "thread %lld:%s(C++)" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).first, (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); + } + + for (auto it = deviceid_streamid_set_.begin(); + it != deviceid_streamid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "Deivce %lld (GPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "stream %lld" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, + (*it).first, (*it).second, (*it).second); + } +} + +void ChromeTracingLogger::EndLog() { + output_file_stream_ << std::string( + R"JSON( } )JSON"); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 06734418609..20a924a54ca 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -13,11 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { namespace platform { +// Dump a NodeTrees into a chrome tracing file. +// A ChromeTracingLogger object can only dump a NodeTrees object, +// creates a file in the constructor and closes the file in the destructor. +// should only call LogNodeTrees and LogMetaInfo in order. 
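+//
+// A minimal usage sketch (illustrative only, not part of this header):
+// `trees` stands for a populated NodeTrees and `extra_info` for the
+// extra-info string map produced by the profiler; both names are placeholders.
+//
+//   ChromeTracingLogger logger("trace.json");   // constructor opens the output file
+//   logger.LogNodeTrees(trees);                 // dumps host/runtime/device events
+//   logger.LogMetaInfo(extra_info);             // closes the event array and
+//                                               // appends the "ExtraInfo" section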
class ChromeTracingLogger : public BaseLogger { public: explicit ChromeTracingLogger(const std::string& filename); @@ -28,6 +35,7 @@ class ChromeTracingLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); @@ -36,9 +44,12 @@ class ChromeTracingLogger : public BaseLogger { void HandleTypeMemcpy(const DeviceTraceEventNode&); void StartLog(); void EndLog(); + void RefineDisplayName(std::unordered_map); std::string filename_; std::ofstream output_file_stream_; static const char* categary_name_[]; + std::set> pid_tid_set_; + std::set> deviceid_streamid_set_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index 672a9a15453..ce2e49a1ccd 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,19 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, - &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, + &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record beginning." + << std::endl; } fclose(stat_file); } @@ -90,19 +87,17 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record end." 
+ << std::endl; } fclose(stat_file); } diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt index e25333f7a8a..5045c56afbc 100644 --- a/paddle/fluid/platform/profiler/dump/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -1,4 +1 @@ proto_library(nodetreeproto SRCS nodetree.proto) -cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) -cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) -cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index d1049a7dc19..de3411579d3 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" - #include +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -36,11 +36,19 @@ void DeserializationReader::OpenFile() { } } -std::unique_ptr DeserializationReader::Parse() { +std::unique_ptr DeserializationReader::Parse() { if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { VLOG(2) << "Unable to load node trees in protobuf." << std::endl; return nullptr; } + // restore extra info + ExtraInfo extrainfo; + for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { + ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); + extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extra_info_map.value().c_str()); + } + // restore NodeTrees std::map thread_event_trees_map; for (int node_tree_index = 0; node_tree_index < node_trees_proto_->thread_trees_size(); @@ -95,7 +103,9 @@ std::unique_ptr DeserializationReader::Parse() { } } // restore NodeTrees object - return std::unique_ptr(new NodeTrees(thread_event_trees_map)); + std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); + return std::unique_ptr( + new ProfilerResult(std::move(tree), extrainfo)); } DeserializationReader::~DeserializationReader() { diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 1ad2dabf229..e6feb4f9489 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" -#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" namespace paddle { namespace platform { @@ -24,7 +24,7 @@ class DeserializationReader { explicit DeserializationReader(const std::string& filename); explicit DeserializationReader(const char* filename); ~DeserializationReader(); - std::unique_ptr Parse(); + std::unique_ptr Parse(); private: void OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 37dac0e597c..7016745059d 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -32,9 +32,21 @@ enum TracerEventTypeProto { Memset = 6; // Used to mark record defined by user UserDefined = 7; - // A flag to denote the number of current types - NumTypes = 8; -} + // Used to mark operator detail, (such as infer shape, compute) + OperatorInner = 8; + // Used to mark model training or testing perspective, forward process + Forward = 9; + // Used to mark model training perspective, backward process + Backward = 10; + // Used to mark model training perspective, optimization process + Optimization = 11; + // Used to mark distributed training perspective + Communication = 12; + // Used to mark python api + PythonOp = 13; + // Used to mark python level userdefined + PythonUserDefined = 14; +}; message KernelEventInfoProto { // The X-dimension block size for the kernel. @@ -175,7 +187,14 @@ message ThreadNodeTreeProto { repeated HostTraceEventNodeProto host_nodes = 2; } +message ExtraInfoMap { + required string key = 1; + required string value = 2; +} + message NodeTreesProto { required string version = 1; - repeated ThreadNodeTreeProto thread_trees = 2; + required uint32 span_indx = 2; + repeated ThreadNodeTreeProto thread_trees = 3; + repeated ExtraInfoMap extra_info = 4; } diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index d9ed84bd438..73021f4362a 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" namespace paddle { @@ -20,6 +21,7 @@ namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; static const char* version = "1.0.0"; +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -39,6 +41,7 @@ void SerializationLogger::OpenFile() { } node_trees_proto_ = new NodeTreesProto(); node_trees_proto_->set_version(std::string(version)); + node_trees_proto_->set_span_indx(span_indx++); } void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { @@ -240,6 +243,15 @@ void SerializationLogger::HandleTypeMemset( device_trace_event); } +void SerializationLogger::LogMetaInfo( + const std::unordered_map extra_info) { + for (const auto& kv : extra_info) { + ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info(); + extra_info_map->set_key(kv.first); + extra_info_map->set_value(kv.second); + } +} + SerializationLogger::SerializationLogger(const std::string& filename) { filename_ = filename.empty() ? DefaultFileName() : filename; OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h old mode 100755 new mode 100644 index 1295be95d45..378834cff59 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -11,6 +11,8 @@ limitations under the License. */ #pragma once +#include + #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" #include "paddle/fluid/platform/profiler/output_logger.h" @@ -20,6 +22,7 @@ namespace platform { // Dump a NodeTrees into a profobuf file. // A SerializationLogger object can only dump a NodeTrees object, // creates a file in the constructor and closes the file in the destructor. +// Should only call LogNodeTrees and LogMetaInfo. 
class SerializationLogger : public BaseLogger { public: explicit SerializationLogger(const std::string& filename); @@ -30,12 +33,14 @@ class SerializationLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); void HandleTypeKernel(const DeviceTraceEventNode&); void HandleTypeMemset(const DeviceTraceEventNode&); void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; std::ofstream output_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 2fe9626ec76..dee1019da2b 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" using paddle::platform::SerializationLogger; using paddle::platform::DeserializationReader; @@ -31,6 +32,7 @@ using paddle::platform::TracerEventType; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::ProfilerResult; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -149,7 +151,8 @@ TEST(SerializationLoggerTest, dump_case1) { TEST(DeserializationReaderTest, restore_case0) { DeserializationReader reader("test_serialization_logger_case0.pb"); - std::unique_ptr tree = reader.Parse(); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -172,3 +175,26 @@ TEST(DeserializationReaderTest, restore_case0) { } } } + +TEST(DeserializationReaderTest, restore_case1) { + DeserializationReader reader("test_serialization_logger_case1.pb"); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc new file mode 100644 index 00000000000..1a6f19d2f93 --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +namespace paddle { +namespace platform { + +HostPythonNode::~HostPythonNode() { + // delete all runtime or device nodes and recursive delete children + for (auto it = children_node_ptrs.begin(); it != children_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = runtime_node_ptrs.begin(); it != runtime_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { + delete *it; + } +} + +HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { + // Copy and transfer EventNode in NodeTree to PythonNode + if (root == nullptr) { + return nullptr; + } + // copy HostTraceEventNode and its children + HostPythonNode* host_python_node = new HostPythonNode(); + host_python_node->name = root->Name(); + host_python_node->type = root->Type(); + host_python_node->start_ns = root->StartNs(); + host_python_node->end_ns = root->EndNs(); + host_python_node->process_id = root->ProcessId(); + host_python_node->thread_id = root->ThreadId(); + for (auto it = root->GetChildren().begin(); it != root->GetChildren().end(); + ++it) { + host_python_node->children_node_ptrs.push_back(CopyTree(*it)); + } + // copy its CudaRuntimeTraceEventNode + for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); + runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + HostPythonNode* runtime_python_node = new HostPythonNode(); + runtime_python_node->name = (*runtimenode)->Name(); + runtime_python_node->type = (*runtimenode)->Type(); + runtime_python_node->start_ns = (*runtimenode)->StartNs(); + runtime_python_node->end_ns = (*runtimenode)->EndNs(); + runtime_python_node->process_id = (*runtimenode)->ProcessId(); + runtime_python_node->thread_id = (*runtimenode)->ThreadId(); + host_python_node->runtime_node_ptrs.push_back(runtime_python_node); + // copy DeviceTraceEventNode + for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DevicePythonNode* device_python_node = new DevicePythonNode(); + device_python_node->name = (*devicenode)->Name(); + device_python_node->type = (*devicenode)->Type(); + device_python_node->start_ns = (*devicenode)->StartNs(); + device_python_node->end_ns = (*devicenode)->EndNs(); + device_python_node->device_id = (*devicenode)->DeviceId(); + device_python_node->context_id = (*devicenode)->ContextId(); + device_python_node->stream_id = (*devicenode)->StreamId(); + runtime_python_node->device_node_ptrs.push_back(device_python_node); + } + } + return host_python_node; +} + +ProfilerResult::ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info) + : tree_(std::move(tree)), extra_info_(extra_info) { + if (tree_ != nullptr) { + std::map nodetrees = 
tree_->GetNodeTrees(); + for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) { + thread_event_trees_map_[it->first] = CopyTree(it->second); + } + } +} + +ProfilerResult::~ProfilerResult() { + // delete all root nodes + for (auto it = thread_event_trees_map_.begin(); + it != thread_event_trees_map_.end(); ++it) { + delete it->second; + } +} + +void ProfilerResult::Save(const std::string& file_name, + const std::string format) { + if (format == std::string("json")) { + ChromeTracingLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } else if (format == std::string("pb")) { + SerializationLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } + return; +} + +std::unique_ptr LoadProfilerResult(std::string filename) { + DeserializationReader reader(filename); + std::unique_ptr result = reader.Parse(); + return result; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index b0d8eaa2427..12ecb9fde32 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -15,8 +15,11 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -66,18 +69,29 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} - explicit ProfilerResult(NodeTrees* tree); + explicit ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info); ~ProfilerResult(); std::map GetData() { - return thread_event_trees_map; + return thread_event_trees_map_; } - void Save(const std::string& file_name); + std::unordered_map GetExtraInfo() { + return extra_info_.GetExtraInfo(); + } + + void Save(const std::string& file_name, + const std::string format = std::string("json")); + + std::unique_ptr& GetNodeTrees() { return tree_; } private: - std::map thread_event_trees_map; - NodeTrees* tree_; - HostPythonNode* CopyTree(HostTraceEventNode* node); + std::map thread_event_trees_map_; + std::unique_ptr tree_; + ExtraInfo extra_info_; + HostPythonNode* CopyTree(HostTraceEventNode* root); }; +std::unique_ptr LoadProfilerResult(std::string filename); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 5784d6e671b..35dbc96874d 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -25,8 +25,10 @@ #endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { @@ -44,10 +46,15 @@ std::unique_ptr Profiler::Create(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) { options_ = options; - HostTracerOptions host_tracer_options; - host_tracer_options.trace_level = options.trace_level; - tracers_.emplace_back(new HostTracer(host_tracer_options), true); - tracers_.emplace_back(&CudaTracer::GetInstance(), false); + std::bitset<32> trace_switch(options_.trace_switch); + if 
(trace_switch.test(kProfileCPUOptionBit)) { + HostTracerOptions host_tracer_options; + host_tracer_options.trace_level = options_.trace_level; + tracers_.emplace_back(new HostTracer(host_tracer_options), true); + } + if (trace_switch.test(kProfileGPUOptionBit)) { + tracers_.emplace_back(&CudaTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } @@ -63,9 +70,10 @@ void Profiler::Start() { for (auto& tracer : tracers_) { tracer.Get().StartTracing(); } + cpu_utilization_.RecordBeginTimeInfo(); } -std::unique_ptr Profiler::Stop() { +std::unique_ptr Profiler::Stop() { SynchronizeAllDevice(); TraceEventCollector collector; for (auto& tracer : tracers_) { @@ -75,7 +83,22 @@ std::unique_ptr Profiler::Stop() { std::unique_ptr tree(new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(), collector.DeviceEvents())); - return tree; + cpu_utilization_.RecordEndTimeInfo(); + ExtraInfo extrainfo; + extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuUtilization()); + extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuCurProcessUtilization()); + const std::unordered_map thread_names = + collector.ThreadNames(); + for (const auto& kv : thread_names) { + extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), + kv.second); + } + return std::unique_ptr( + new platform::ProfilerResult(std::move(tree), extrainfo)); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 4fc1c6daf96..f9a8ece0504 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -15,12 +15,15 @@ #pragma once #include +#include #include #include #include #include #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/tracer_base.h" DECLARE_int64(host_trace_level); @@ -28,7 +31,11 @@ DECLARE_int64(host_trace_level); namespace paddle { namespace platform { +static constexpr uint32_t kProfileCPUOptionBit = 0; +static constexpr uint32_t kProfileGPUOptionBit = 1; + struct ProfilerOptions { + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -40,7 +47,7 @@ class Profiler { void Start(); - std::unique_ptr Stop(); + std::unique_ptr Stop(); ~Profiler(); @@ -70,6 +77,7 @@ class Profiler { ProfilerOptions options_; uint64_t start_ns_ = UINT64_MAX; std::list tracers_; + CpuUtilization cpu_utilization_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 160c801dc6e..32310b9e862 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -30,8 +31,10 @@ TEST(ProfilerTest, TestHostTracer) { using paddle::platform::Profiler; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; + options.trace_switch = 3; auto profiler = 
Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -42,7 +45,8 @@ TEST(ProfilerTest, TestHostTracer) { RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, 3); } - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::set host_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto evt : pair.second) { @@ -56,8 +60,10 @@ TEST(ProfilerTest, TestHostTracer) { TEST(ProfilerTest, TestCudaTracer) { using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; + options.trace_switch = 3; auto profiler = Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -72,7 +78,8 @@ TEST(ProfilerTest, TestCudaTracer) { hipStreamCreate(&stream); hipStreamSynchronize(stream); #endif - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::vector runtime_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto host_node : pair.second) { diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 61f96218560..16ef62fb515 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -48,6 +48,8 @@ enum class TracerEventType { Communication = 12, // Used to mark python api PythonOp = 13, + // Used to mark python level userdefined + PythonUserDefined = 14, // A flag to denote the number of current types NumTypes }; -- GitLab From 1db188f318ae0b0292984e08afd626898e3170da Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 2 Mar 2022 15:37:29 +0800 Subject: [PATCH 051/272] [IPU] update ipu unittests p0 (#39707) * update ipu UTs part0 * rename UT * sync api changes * update uts for new api * use_ipumodel() as classmethod --- .../tests/unittests/ipu/ernie_training.py | 934 ------------------ .../fluid/tests/unittests/ipu/op_test_ipu.py | 73 +- .../unittests/ipu/test_activation_x_op_ipu.py | 133 +++ .../unittests/ipu/test_arg_max_op_ipu.py | 117 +++ .../tests/unittests/ipu/test_assign_op_ipu.py | 102 ++ .../tests/unittests/ipu/test_avg_shard_ipu.py | 112 ++- .../unittests/ipu/test_batch_norm_op_ipu.py | 108 +- ....py => test_batchs_per_step_simple_ipu.py} | 22 +- .../tests/unittests/ipu/test_cast_op_ipu.py | 111 ++- .../tests/unittests/ipu/test_concat_op_ipu.py | 93 +- .../tests/unittests/ipu/test_conv_op_ipu.py | 127 +-- .../ipu/test_cross_entropy2_op_ipu.py | 128 ++- .../tests/unittests/ipu/test_cumsum_op_ipu.py | 123 +++ 13 files changed, 950 insertions(+), 1233 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/ernie_training.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_batchs_per_step_simple.py => test_batchs_per_step_simple_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py deleted file mode 100644 index ddda666db2c..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ 
/dev/null @@ -1,934 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# refrenece : https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/ernie - -import os -import copy -import argparse -from contextlib import contextmanager -from functools import partial - -import numpy as np -import paddle -import paddle.static -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.compiler as compiler -paddle.enable_static() - -SEED = 2021 -INT_DTYPE = None - -# ernie related block -ernie_config = { - "emb_size": 128, - "emb_mapping_in": False, - "hidden_size": 192, - "num_hidden_layers": 2, - "n_layer_per_block": 2, - "num_attention_heads": 12, - "vocab_size": 300, - "max_position_embeddings": 512, - "sent_type_vocab_size": 4, - "task_type_vocab_size": 16, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.0, - "attention_probs_dropout_prob": 0.0, - "preln": False, - "pre_encoder_cmd": "n", - "preprocess_cmd": "", - "postprocess_cmd": "an", - "epsilon": 1e-12, - "initializer_range": 0.02, - "seq_len": 32 -} - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + fluid.layers.tanh( - (np.sqrt(2.0 / np.pi) * (x + 0.044715 * fluid.layers.pow(x, 3.0))))) - return x * cdf - - -def pre_post_process_layer(prev_out, - out, - process_cmd, - dropout_rate=0., - epsilon=1e-12, - name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr( - name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=epsilon) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def positionwise_feed_forward(x, - d_inner_hid, - d_hid, - dropout_rate, - hidden_act, - param_initializer=None, - name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. 
- """ - - #assert hidden_act == 'gelu.approximate' - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_fc_0.w_0', - initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - hidden = gelu(hidden) - - if dropout_rate: - hidden = layers.dropout( - hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - - return out - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. - """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_query_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_key_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_value_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=False) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. 
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_output_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - - return proj_out - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name='', - epsilon=1e-12): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - - attn_output = multi_head_attention( - enc_input, - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - - attn_output = post_process_layer( - enc_input, - attn_output, - 'an', - prepostprocess_dropout, - name=name + '_post_att', - epsilon=epsilon) - - ffd_output = positionwise_feed_forward( - attn_output, - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - - post_output = post_process_layer( - attn_output, - ffd_output, - 'an', - prepostprocess_dropout, - name=name + '_post_ffn', - epsilon=epsilon) - - return post_output - - -def encoder_inner_share(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=None, - name='', - n_layer_per_block=1): - """ - The encoder_inner_share is composed of n_layer_per_block layers returned by calling - encoder_layer. 
- """ - - for i in range(n_layer_per_block): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i), - epsilon=epsilon) - - enc_input = enc_output - - return enc_output - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - n_layer_per_block, - param_initializer=None, - name='', - preln=False): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer . - """ - - for _ in range(n_layer // n_layer_per_block): - attn_bias.stop_gradient = True - attn_bias.persistable = False - enc_output = encoder_inner_share( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=param_initializer, - name=name, - n_layer_per_block=n_layer_per_block) - - enc_input = enc_output - - if preln: - enc_output = post_process_layer( - None, - enc_output, - 'n', - prepostprocess_dropout, - name='post_encoder', - epsilon=epsilon) - - enc_output = pre_process_layer( - enc_output, - preprocess_cmd, - prepostprocess_dropout, - name="post_encoder", - epsilon=epsilon) - - return enc_output - - -class ErnieModel(object): - def __init__(self, src_ids, sent_ids, pos_ids, input_mask, config): - - self._emb_size = config['emb_size'] if config[ - 'emb_mapping_in'] else config['hidden_size'] - self._hidden_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['sent_type_vocab_size'] - self._task_types = config['task_type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self.config = config - self.preln = config['preln'] if 'preln' in config.keys() else False - self.pre_encoder_cmd = "" if self.preln else self.config[ - 'pre_encoder_cmd'] - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._task_emb_name = "task_embedding" - self._dtype = "float32" - self._emb_dtype = "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self.src_ids = src_ids - self.sent_ids = sent_ids - self.pos_ids = pos_ids - self.input_mask = input_mask - ''' - _build_position_ids: range op doesn't support - _build_input_mask: logic_not op doesn't support - ''' - - self._build_model() - - def _build_model(self, emb=None): - with fluid.ipu_shard(ipu_index=0, ipu_stage=0): - # padding id in vocabulary must be set to 0 - self.emb_out = fluid.layers.embedding( - input=self.src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - - self.position_emb_out = fluid.layers.embedding( - input=self.pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, - initializer=self._param_initializer)) - - self.sent_emb_out = fluid.layers.embedding( - self.sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, - initializer=self._param_initializer)) - - sum_emb = self.emb_out + self.position_emb_out + self.sent_emb_out - - sum_emb = pre_process_layer( - sum_emb, - self.config['pre_encoder_cmd'], - self._prepostprocess_dropout, - name='pre_encoder', - epsilon=self.config['epsilon']) - - if self.config['emb_mapping_in']: - sum_emb = fluid.layers.fc( - input=sum_emb, - num_flatten_dims=2, - size=self._hidden_size, - param_attr=fluid.ParamAttr( - name='emb_hidden_mapping', - initializer=self._param_initializer), - bias_attr='emb_hidden_mapping_bias') - - self_attn_mask = fluid.layers.matmul( - x=self.input_mask, y=self.input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, - scale=10000.0, - bias=-1.0, - bias_after_scale=False) - - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, - axis=1) # [bs, _n_head, seqlen, seq_len] - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder( - enc_input=sum_emb, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._hidden_size // self._n_head, - d_value=self._hidden_size // self._n_head, - d_model=self._hidden_size, - d_inner_hid=self._hidden_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd=self.config['preprocess_cmd'], - postprocess_cmd=self.config['postprocess_cmd'], - param_initializer=self._param_initializer, - name='encoder', - epsilon=self.config['epsilon'], - n_layer_per_block=self.config['n_layer_per_block'], - preln=self.preln) - - def _build_position_ids(self): - d_shape = fluid.layers.shape(self.src_ids) - d_seqlen = d_shape[1] - d_batch = d_shape[0] - position_ids = fluid.layers.reshape( - fluid.layers.range( - 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], - inplace=False) - position_ids = fluid.layers.expand(position_ids, [d_batch, 1, 1]) - position_ids = fluid.layers.cast(position_ids, INT_DTYPE) - position_ids.stop_gradient = True - return position_ids - - def _build_input_mask(self): - zero = fluid.layers.fill_constant([1], dtype=INT_DTYPE, value=0) - input_mask = fluid.layers.logical_not( - fluid.layers.equal(self.src_ids, zero)) # assume pad id == 0 - input_mask = fluid.layers.cast(input_mask, 
'float32') - input_mask.stop_gradient = True - return input_mask - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - next_sent_feat = fluid.layers.slice( - input=self._enc_out, axes=[1], starts=[0], ends=[1]) - - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._hidden_size, - act="tanh", - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_next_sentence_output(self, labels): - next_sent_feat = self.get_pooled_output() - next_sent_fc_out = fluid.layers.fc( - input=next_sent_feat, - num_flatten_dims=1, - size=33, - param_attr=fluid.ParamAttr( - name="next_sent_fc.w_0", initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - next_sent_fc_out = fluid.layers.reshape( - next_sent_fc_out, [-1, 33], inplace=False) - #next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=next_sent_fc_out, label=labels, return_softmax=True) - next_sent_softmax = fluid.layers.softmax(next_sent_fc_out) - next_sent_loss = fluid.layers.cross_entropy(next_sent_softmax, labels) - next_sent_acc = fluid.layers.accuracy( - input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss, - "mean_next_sent_loss") - return next_sent_acc, mean_next_sent_loss - - def get_lm_output(self, mask_label, mask_pos): - """Get the loss & accuracy for pretraining""" - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - reshaped_emb_out = fluid.layers.reshape( - x=self._enc_out, shape=[-1, self._hidden_size]) - - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - if self._dtype == "float16": - mask_feat = fluid.layers.cast(x=mask_feat, dtype=self._emb_dtype) - - # transform: fc - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - _hidden_act = 'gelu' - else: - _hidden_act = None - - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=self._emb_size, - act=_hidden_act, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - pass - else: - mask_trans_feat = gelu(mask_trans_feat) - - # transform: layer norm - mask_trans_feat = fluid.layers.layer_norm( - mask_trans_feat, - begin_norm_axis=len(mask_trans_feat.shape) - 1, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=self.config['epsilon']) - - mask_lm_out_bias_attr = fluid.ParamAttr( - name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr( - name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - #mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - # logits=fc_out, label=mask_label) - mask_lm_softmax = fluid.layers.softmax(fc_out) - mask_lm_loss = fluid.layers.cross_entropy(mask_lm_softmax, mask_label) - mean_mask_lm_loss = fluid.layers.mean( - mask_lm_loss, name="mean_mask_lm_loss") - - 
return mask_lm_loss, mean_mask_lm_loss - - def get_task_output(self, task, task_labels): - task_fc_out = fluid.layers.fc(input=self.next_sent_feat, - size=task["num_labels"], - param_attr=fluid.ParamAttr( - name=task["task_name"] + "_fc.w_0", - initializer=self._param_initializer), - bias_attr=task["task_name"] + "_fc.b_0") - #task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=task_fc_out, label=task_labels, return_softmax=True) - task_softmax = fluid.layers.softmax(task_fc_out) - task_loss = fluid.layers.cross_entropy(task_softmax, task_labels) - task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) - mean_task_loss = fluid.layers.mean(task_loss) - return mean_task_loss, task_acc - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - parser.add_argument( - "--run_on_ipu", type=bool, default=True, help="Run model with IPU") - parser.add_argument( - "--is_training", type=bool, default=True, help="Train of inference") - parser.add_argument( - "--num_ipus", type=int, default=2, help="Number of ipus") - parser.add_argument( - "--enable_pipelining", type=bool, default=False, help="Pipelining") - parser.add_argument( - "--save_model", type=bool, default=False, help="Save model or not") - parser.add_argument( - "--model_path", type=str, default="ernie", help="Save model to where") - parser.add_argument( - "--model_name", type=str, default="ernie", help="Save model name") - parser.add_argument( - "--ipu_run_steps", type=int, default=10, help="Number steps exe.run()") - parser.add_argument( - "--export_ops", type=bool, default=False, help="Export ops to ops.txt") - parser.add_argument( - "--export_ipu_idx", type=bool, default=False, help="Export op-idx pair") - args = parser.parse_args() - - # set random seed - np.random.seed(SEED) - paddle.static.default_startup_program().random_seed = SEED - paddle.static.default_main_program().random_seed = SEED - - # IPU doesn't support int64, so we change here - INT_DTYPE = "int32" if args.run_on_ipu else "int64" - - # paddle input placeholder, batch_size = 1 - micro_bs = 1 - seq_len = ernie_config["seq_len"] - input_shape = [micro_bs, seq_len, 1] - input_fields = { - 'names': [ - 'src_ids', 'sent_ids', 'pos_ids', 'input_mask', 'mask_label', - 'mask_pos' - ], - 'shapes': [ - input_shape, input_shape, input_shape, input_shape, [micro_bs, 1], - [micro_bs, 1] - ], - 'dtypes': - [INT_DTYPE, INT_DTYPE, INT_DTYPE, 'float32', INT_DTYPE, INT_DTYPE], - 'range': [[0, seq_len], [0, 4], [0, seq_len], None, [0, seq_len], - [0, seq_len]], - 'lod_levels': [0, 0, 0, 0, 0, 0], - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - # total_samples: assum disable pipelining - batches_per_step = 1 - if args.enable_pipelining: - batches_per_step = \ - ((args.num_ipus+1) if args.is_training else args.num_ipus) - total_samples = args.ipu_run_steps * batches_per_step - - total_steps = args.ipu_run_steps - if not args.run_on_ipu: # run on cpu - total_steps = total_samples // micro_bs - - # synthetic data - np_inputs = [] - for i in range(len(input_fields['names'])): - field_name = input_fields['names'][i] - if field_name == 'input_mask': - src_ids = np_inputs[0] - dtype = input_fields['dtypes'][i] - data = np.where(src_ids > 0, - np.ones_like(src_ids), - np.zeros_like(src_ids)).astype(dtype) - else: - shape = 
copy.copy(input_fields['shapes'][i]) - shape[0] = total_samples - min_val, max_val = input_fields['range'][i] - data = np.random.randint( - min_val, max_val, shape, dtype=input_fields['dtypes'][i]) - np_inputs.append(data) - - # paddle input placeholder - (src_ids, sent_ids, pos_ids, input_mask, mask_label, mask_pos) = inputs - - # ernie model - ernie = ErnieModel(src_ids, sent_ids, pos_ids, input_mask, ernie_config) - fetch_node = ernie.get_sequence_output() - if args.is_training: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - _, mean_mask_lm_loss = ernie.get_lm_output(mask_label, mask_pos) - fetch_node = mean_mask_lm_loss - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(mean_mask_lm_loss) - - # place = paddle.CPUPlace() - if args.run_on_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - - # feed & fetch list - if args.is_training: - feed_list = input_fields['names'] - else: - feed_list = input_fields['names'][:4] - fetch_list = [fetch_node.name] - - # program - startup_prog = paddle.static.default_startup_program() - executor.run(startup_prog) - - main_prog = paddle.static.default_main_program() - paddle.static.save(main_prog, "model/ernie") - paddle.static.load(main_prog, "model/ernie") - - if args.run_on_ipu: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=args.num_ipus, - is_training=args.is_training, - enable_manual_shard=args.num_ipus > 1) - ipu_strategy.SetPipeliningConfig( - enable_pipelining=args.enable_pipelining, - batches_per_step=args.num_ipus + 1) - - ipu_compiler = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy) - program = ipu_compiler.compile(feed_list, fetch_list) - else: - program = main_prog - - # executor run - results = [] - for i in range(total_steps): - start = i * (batches_per_step if args.run_on_ipu else 1) - end = start + (batches_per_step if args.run_on_ipu else 1) - feed_dict = { - src_ids.name: np_inputs[0][start:end], - sent_ids.name: np_inputs[1][start:end], - pos_ids.name: np_inputs[2][start:end], - input_mask.name: np_inputs[3][start:end] - } - if args.is_training: - feed_dict[mask_label.name] = np_inputs[4][start:end] - feed_dict[mask_pos.name] = np_inputs[5][start:end] - - res = executor.run(program, feed=feed_dict, fetch_list=[fetch_node]) - results.append(res) - - paddle.static.save(main_prog, "model/ernie") - - results = np.asarray(results).flatten() - if results.size > 32: - results = results[-32:] - print(results) - - if args.save_model: - full_name = args.model_path + '/' + args.model_name - if args.is_training: - fluid.save(program=main_prog, model_path=full_name) - else: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - paddle.static.save_inference_model( - full_name, [src_ids, sent_ids, pos_ids, input_mask], - [fetch_node], executor) - - if args.export_ops: - op_type_list = [] - for op in main_prog.global_block().ops: - op_type_list.append(op.desc.type()) - - with open("ops.txt", "w") as fp: - for op_type in set(op_type_list): - fp.write(op_type + os.linesep) - - if args.export_ipu_idx: - op_ipu_idx_list = [] - for op in main_prog.global_block().ops: - if op._is_backward_op(): - continue - - op_ipu_idx_pair = [op.desc.type()] - if op.desc.has_attr("ipu_index"): - op_ipu_idx_pair.append(op.desc.attr("ipu_index")) - else: - op_ipu_idx_pair.append(-1) # not assign ipu_index - op_ipu_idx_list.append(op_ipu_idx_pair) - op_ipu_idx_list.sort(key=lambda item: item[-1]) - - with open("ops_ipu_idx.txt", "w") 
as fp: - for op_ipu_idx_pair in op_ipu_idx_list: - fp.write(str(op_ipu_idx_pair) + os.linesep) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 0d09f604060..790388f30ea 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import unittest - import numpy as np -from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator -from typing import Optional -import paddle.fluid.compiler as compiler - -SEED = 2021 +from enum import Enum -ipu_compiler_ref: Optional[compiler.IPUCompiledProgram] = None +import paddle +import paddle.static map_np_dtype_to_fluid_dtype = { 'bool': "bool", @@ -36,6 +33,19 @@ map_np_dtype_to_fluid_dtype = { } +class ExecutionMode(Enum): + CPU_FP32 = 1 + IPU_FP32 = 2 + # enable_fp16 through ipu_strategy.enable_fp16 + IPU_POPART_FP16 = 3 + + def __lt__(self, other): + return self.value < other.value + + def __gt__(self, other): + return self.value > other.value + + def np_dtype_to_fluid_str(dtype: np.dtype) -> str: return map_np_dtype_to_fluid_dtype[dtype.name] @@ -43,14 +53,16 @@ def np_dtype_to_fluid_str(dtype: np.dtype) -> str: class IPUOpTest(unittest.TestCase): @classmethod def setUpClass(cls): + # Get random seeds cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.SEED = SEED + cls.SEED = 2021 np.random.seed(cls.SEED) random.seed(cls.SEED) - cls._use_system_allocator = _set_use_system_allocator(True) + # Enable paddle static graph mode + paddle.enable_static() @classmethod def tearDownClass(cls): @@ -58,14 +70,47 @@ class IPUOpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) - _set_use_system_allocator(cls._use_system_allocator) - # unittest will to trigger IPUCompiledProgram.__del__ automatically - global ipu_compiler_ref - ipu_compiler_ref is not None and ipu_compiler_ref.clean() + @classmethod + def use_ipumodel(cls): + if 'POPLAR_IPUMODEL' not in os.environ: + return False + else: + flag = os.environ['POPLAR_IPUMODEL'] + if flag.upper() in ['1', "TRUE"]: + return True def set_atol(self): - self.atol = 1e-5 + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 def set_training(self): self.is_training = False self.epoch = 1 + + def check(self, outputs, check_shape=False): + cpu_fp32 = outputs[ExecutionMode.CPU_FP32] + ipu_fp32 = outputs[ExecutionMode.IPU_FP32] + max_diff = np.abs(cpu_fp32 - ipu_fp32).max() + fp32_flag = np.allclose( + cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol) + self.assertTrue(fp32_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) + + ipu_popart_fp16 = None + if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): + ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] + max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - + cpu_fp32).max() + fp16_flag = np.allclose( + ipu_popart_fp16.astype(np.float32), + cpu_fp32, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py new 
file mode 100644 index 00000000000..138365b650f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestRelu(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.relu + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = self.op(x, **self.op_attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTanh(TestRelu): + def set_test_op(self): + self.op = F.tanh + self.op_attrs = {} + + +class TestLog(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.log + self.op_attrs = {} + + +class TestSigmoid(TestRelu): + def set_test_op(self): + self.op = F.sigmoid + self.op_attrs = {} + + +class TestSqrt(TestRelu): + def set_test_op(self): + self.op = 
paddle.fluid.layers.sqrt + self.op_attrs = {} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py new file mode 100644 index 00000000000..d14eba98ef5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[10, 1000]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"axis": -1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.argmax(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0].astype(np.int32) + + def test_base(self): + output_dict_fp32 = {} + output_dict_fp16 = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + if mode > ExecutionMode.IPU_FP32: + output_dict_fp16[mode] = self._test_base(mode).flatten() + else: + output_dict_fp32[mode] = self._test_base(mode).flatten() + + 
self.check(output_dict_fp32) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"axis": 0} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py new file mode 100644 index 00000000000..4f17c90de72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + assign = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(assign, assign) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index a23cacf4763..f34e5b0d8b9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,78 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 2e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 128, 128]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - conv1 = paddle.static.nn.conv2d( + + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [x.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True, - need_avg_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + if exec_mode == 
ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 87f783dbd1c..1dab958c1ec 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -16,13 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +27,100 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = False self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == 
ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' @@ -108,7 +128,13 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py rename to python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py index 9b485d7794d..ef61e651b2a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py @@ -17,8 +17,7 @@ from __future__ import print_function import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ SEED = 2021 "core is not compiled with IPU") class TestFunc(unittest.TestCase): def _test_func(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -40,22 +39,20 @@ class TestFunc(unittest.TestCase): c, h, w = 3, 10, 10 np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[n, c, h, w], dtype='float32') conv2d = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - # paddle.mean oshape on ipu is [bps], need another mean() - # paddle.mean oshape on cpu is [1] - # out = paddle.mean(conv2d) out = 
conv2d if run_ipu: place = paddle.IPUPlace() else: place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) @@ -63,14 +60,9 @@ class TestFunc(unittest.TestCase): feed_list = [image.name] fetch_list = [out.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=False, - enable_manual_shard=True, - need_avg_shard=True) - ipu_strategy.SetPipeliningConfig( - enable_pipelinin=True, batches_per_step=bps) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_pipelining_config(batches_per_step=bps) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 6e58f809046..5f0eeaa2f99 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,14 +26,14 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() def set_atol(self): self.atol = 1e-3 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -47,23 +41,20 @@ class TestBase(IPUOpTest): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -82,8 +73,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -103,27 +94,91 @@ class TestBase(IPUOpTest): self.assertTrue(res0.shape == res1.shape) -class TestCase1(TestBase): - def set_attrs(self): +class TestCase2(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} 
+ self.attrs['dtype'] = 'float32' + + +class TestCase3(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase4(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase5(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase6(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' @unittest.skip('float64 is not supported') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float64' @unittest.skip('skip float16 to float32') class TestCase3(TestBase): - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' @@ -133,13 +188,13 @@ class TestCase4(TestBase): def set_atol(self): self.atol = 1 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'int8' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 094b19ce99d..c5a80902839 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -16,14 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,81 +27,95 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + data2 = np.random.uniform(size=[1, 3, 10, 10]) - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in 
self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 1} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index f28733de6b1..ade54fda869 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,20 +26,30 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - 
size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['num_filters'] = 3 self.attrs['filter_size'] = 3 @@ -54,104 +59,112 @@ class TestBase(IPUOpTest): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.conv2d(image, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['num_filters'] = 1 class TestCase2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 3] class TestCase2_1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 2] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['stride'] = [2, 3] 
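Note: every rewritten test in this series funnels its per-mode outputs through IPUOpTest.check(), which treats CPU FP32 as the reference, compares the IPU FP32 result with the fp32 tolerances, and casts the IPU_POPART_FP16 result up to float32 before comparing with the looser fp16 tolerances. A condensed NumPy sketch of that comparison, using stand-in arrays and the tolerances this conv test configures (atol=rtol=1e-6 for fp32, 1e-3 for fp16), not part of the patch:

import numpy as np

# Stand-in outputs; the real values come from exe.run() in the three modes.
cpu_fp32 = np.random.rand(16).astype(np.float32)
ipu_fp32 = cpu_fp32 + 1e-7                     # pretend IPU fp32 result
ipu_popart_fp16 = cpu_fp32.astype(np.float16)  # pretend IPU popart fp16 result

# fp32 path: compare against the CPU reference with the fp32 tolerances.
assert np.allclose(cpu_fp32, ipu_fp32, rtol=1e-6, atol=1e-6)

# fp16 path: cast up to fp32, then use the looser fp16 tolerances.
assert np.allclose(ipu_popart_fp16.astype(np.float32), cpu_fp32, rtol=1e-3, atol=1e-3)

print("max fp32 diff:", np.abs(cpu_fp32 - ipu_fp32).max())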
class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['dilation'] = [2, 2] class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['groups'] = 3 class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = 2 class TestCase7(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [2, 3] class TestCase8(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [1, 2, 2, 3] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3987c6cd5b3..3a21f0cb007 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,54 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 7]).astype('float32'), - "label": np.arange(3).reshape([3]).astype(np.int64), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {'soft_label': False, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def np_nll_loss(self): + tmp = -np.log(self.feed_fp32['x']) + label = self.feed_fp32['label'] + indice = [range(label.shape[0]), label.flatten()] + self.np_ref = tmp[indice] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") - # [warning] Copying (host) tensor input/1 from INT64 to INT32. 
- # Will only warn once - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: label = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], @@ -80,52 +84,78 @@ class TestBase(IPUOpTest): shape=self.feed_shape[1], dtype='int64') - out = fluid.layers.cross_entropy( + out = paddle.fluid.layers.cross_entropy( input=x, label=label, **self.attrs) + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + if exec_mode != ExecutionMode.CPU_FP32: + feed['label'] = feed['label'].astype(np.int32) - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue(res0.shape == res1.shape) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + self.np_nll_loss() + + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'soft_label': False, 'ignore_index': 1, } -@unittest.skip("soft_label=True id not supported") class TestCase2(TestBase): - def set_attrs(self): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +@unittest.skip("soft_label=True is not supported") +class TestCase3(TestBase): + def set_op_attrs(self): self.attrs = {'soft_label': True, } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py new file mode 100644 index 00000000000..2f1d86daf00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
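Note: the new cumsum test whose license header appears just above exercises the exclusive and reverse attributes (see the TestCase1 to TestCase3 variants further down). As a quick reference, and assuming the ONNX-style cumsum conventions that popart consumes, the variants behave like this NumPy sketch (toy input, not part of the patch):

import numpy as np

x = np.array([1., 2., 3., 4.])

inclusive = np.cumsum(x)                               # [ 1.  3.  6. 10.]
exclusive = np.concatenate(([0.], np.cumsum(x)[:-1]))  # [ 0.  1.  3.  6.]
reverse = np.cumsum(x[::-1])[::-1]                     # [10.  9.  7.  4.]

print(inclusive, exclusive, reverse)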
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + # popart unsupport fp16 cumsum + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + + out = paddle.fluid.layers.cumsum(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": False} + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": False, "reverse": True} + + +class TestCase3(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": True} + + +if __name__ == "__main__": + unittest.main() -- GitLab From 6af2729e615a8d6b3b4f96964f1c71d20b8f5517 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:45:28 +0800 Subject: [PATCH 052/272] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20gather?= =?UTF-8?q?=5Ftree,reduce=5Fprod=20to=20phi=20(#39844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move to phi * migrate gather_tree_op into phi * move reduce_prod tp phi * optimize code --- paddle/fluid/operators/gather_tree_op.cc | 4 +- paddle/fluid/operators/gather_tree_op.cu | 84 ------------------- paddle/fluid/operators/gather_tree_op.h | 66 --------------- .../operators/reduce_ops/reduce_prod_op.cc | 10 +-- 
.../operators/reduce_ops/reduce_prod_op.h | 7 -- paddle/phi/kernels/cpu/gather_tree_kernel.cc | 62 ++++++++++++++ paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 44 ++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ .../kernels/gather_tree_kernel.h} | 21 +++-- paddle/phi/kernels/gpu/gather_tree_kernel.cu | 79 +++++++++++++++++ paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 ++++++++++ paddle/phi/kernels/reduce_prod_kernel.h | 29 +++++++ paddle/phi/ops/compat/reduce_sig.cc | 6 ++ 13 files changed, 285 insertions(+), 178 deletions(-) delete mode 100644 paddle/fluid/operators/gather_tree_op.cu delete mode 100644 paddle/fluid/operators/gather_tree_op.h create mode 100644 paddle/phi/kernels/cpu/gather_tree_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename paddle/{fluid/operators/reduce_ops/reduce_prod_op.cu => phi/kernels/gather_tree_kernel.h} (51%) create mode 100644 paddle/phi/kernels/gpu/gather_tree_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu create mode 100644 paddle/phi/kernels/reduce_prod_kernel.h diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0..2868c3697ed 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -73,5 +73,3 @@ selected ids. namespace ops = paddle::operators; REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a6..00000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e795..00000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3..eb745ab9c56 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bd..60dedf8d6ff 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc new file mode 100644 index 00000000000..25fb870d851 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
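Note on semantics: gather_tree reconstructs full beam-search sequences by starting from the last time step and walking the parent pointers backwards, exactly as the deleted fluid kernels above and the new phi kernels below do. A NumPy sketch of that backtracking (ref_gather_tree is a made-up name for illustration, not an in-tree helper):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def ref_gather_tree(ids, parents):
    # ids, parents: integer arrays of shape [max_length, batch_size, beam_size]
    max_length, batch_size, beam_size = ids.shape
    out = np.empty_like(ids)
    for batch in range(batch_size):
        for beam in range(beam_size):
            # the last step is copied as-is, then the parent chain is followed backwards
            out[max_length - 1, batch, beam] = ids[max_length - 1, batch, beam]
            parent = parents[max_length - 1, batch, beam]
            for step in range(max_length - 2, -1, -1):
                out[step, batch, beam] = ids[step, batch, parent]
                parent = parents[step, batch, parent]
    return out
# ---- end sketch ----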
+ +#include "paddle/phi/kernels/gather_tree_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + for (int batch = 0; batch < batch_size; batch++) { + for (int beam = 0; beam < beam_size; beam++) { + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, CPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc new file mode 100644 index 00000000000..cf0179124eb --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + CPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index ce8e095e8ac..aebd155ac59 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -33,5 +33,13 @@ struct MeanFunctor { } }; +//////// Prod Functor /////// +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/phi/kernels/gather_tree_kernel.h similarity index 51% rename from paddle/fluid/operators/reduce_ops/reduce_prod_op.cu rename to paddle/phi/kernels/gather_tree_kernel.h index 2de647df8b1..e5a1a684dae 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - reduce_prod, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu new file mode 100644 index 00000000000..a9e73ec37c8 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gather_tree_kernel.h" + +namespace phi { + +template +__global__ void GatherTree(const T *ids_data, + const T *parents_data, + T *out_data, + const int64_t max_length, + const int64_t batch_size, + const int64_t beam_size) { + CUDA_KERNEL_LOOP(i, batch_size * beam_size) { + int batch = i / beam_size; + int beam = i % beam_size; + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } +} + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + T *out_data = dev_ctx.template Alloc(out); + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + auto &ids_dims = ids.dims(); + int64_t max_length = ids_dims[0]; + int64_t batch_size = ids_dims[1]; + int64_t beam_size = ids_dims[2]; + + const int block = 512; + int max_threads = + std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), + batch_size * beam_size); + const int grid = std::max(max_threads / block, 1); + GatherTree<<>>( + ids_data, parents_data, out_data, max_length, batch_size, beam_size); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, GPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu new file mode 100644 index 00000000000..14084d0f4f3 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + GPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/reduce_prod_kernel.h new file mode 100644 index 00000000000..5e92b6c4db1 --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 6395486ed2b..92839fb3030 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -51,6 +51,11 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -58,3 +63,4 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); -- GitLab From c9cd47d96b2cccb34d8dc269a055f5b64346a10e Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 2 Mar 2022 15:58:57 +0800 Subject: [PATCH 053/272] [Auto Parallel] Adapt Partitioner & DistOp for ERNIE3.0 Inference and cache (#39895) * adapot dist op * add dist_fill_constant_batch_size_like * remvoe print * update compitable * add unitest --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_eltwise.py | 0 .../auto_parallel/operators/dist_embedding.py | 5 +- .../dist_fill_constant_batch_size_like.py | 127 ++++++++++++++++++ .../auto_parallel/operators/dist_matmul.py | 8 +- .../distributed/auto_parallel/partitioner.py | 3 + .../test_auto_parallel_while_op.py | 28 ++++ 7 files changed, 168 insertions(+), 4 deletions(-) mode change 100755 => 100644 python/paddle/distributed/auto_parallel/operators/dist_eltwise.py create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py diff 
--git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 9f84df2d896..db6f909f8ca 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -27,3 +27,4 @@ from . import dist_eltwise from . import dist_check_finite_and_unscale from . import dist_update_loss_scaling from . import dist_split +from . import dist_fill_constant_batch_size_like diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py old mode 100755 new mode 100644 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 94eb0d2d469..32f8e2acef5 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -155,7 +155,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): kwargs['Out']) Ids_var = main_block.var(kwargs['Ids'][0]) - Weight_var = main_block.var(kwargs['W'][0]) + Weight_var = main_block._var_recursive(kwargs['W'][0]) Out_var = main_block.var(kwargs['Out'][0]) # got dist attribute info @@ -277,7 +277,8 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): # param initialization sync if Weight_var.is_parameter and not op_dist_attr.is_recompute: - assert Weight_var.name not in dist_op_context.already_init_sync_vars + if Weight_var.name in dist_op_context.already_init_sync_vars: + return dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py new file mode 100644 index 00000000000..0c9d9eda02e --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from .dist_default import DistributedDefaultImpl0 + + +class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFillConstantBatchSizeLike, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like")) + + +class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + shape_list = op_desc.attr("shape") + + if len(shape_list) != len(out_dims_mapping): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + in_name = op_desc.input('Input')[0] + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + + # the dim_mapping of batch dimension should be the same + return out_dims_mapping[0] == in_dims_mapping[0] + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + # only the batch size dimemsion of input and output are relative. 
+ dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [0, 0]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + dist_op_context = ctx.dist_op_context + src_op = dist_op_context.cur_src_op + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + main_block = dist_op_context.work_block + op = main_block.ops[-1] + assert op.type == "fill_constant_batch_size_like" + + # modify shape attr according to how output are partitioned + out_name = op.output('Out')[0] + dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = op_dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] + + op._set_attr("shape", shape_list) + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl( + "fill_constant_batch_size_like", + DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 9eb24a65e60..058ae1d0a9f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -433,8 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): - assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format( - Weight_var.name, dist_op_context.already_init_sync_vars) + if Weight_var.name in dist_op_context.already_init_sync_vars: + return assert startup_block.has_var(Weight_var.name) dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) @@ -819,6 +819,8 @@ class DistributedMatmulImpl1(DistributedOperatorImpl): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, @@ -1323,6 +1325,8 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 2f88407c093..ed5ec85d84f 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -285,6 +285,9 @@ def _get_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.dims_mapping mesh = dist_attr.process_mesh.topology + if mapping == []: + return var_shape + assert len(var_shape) == len( mapping ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py index 1cd8f8f3e70..07e6a2c4346 100644 
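The forward() above only has to rescale the shape attribute so that every rank materialises its local shard: any axis whose dims_mapping entry is non-negative is divided by the size of the mapped process-mesh dimension, and partitioner._get_dist_shape applies the same rule to variables. A standalone sketch (shard_shape is a made-up name), which also reproduces the expectation checked by the unit test that follows, where a [-1, 16, 0, 48] target mapped as [-1, 0, -1, -1] on a two-rank mesh axis becomes [-1, 8, 0, 48]:

# ---- illustrative sketch, not part of the diff ----
def shard_shape(shape, dims_mapping, mesh_topology):
    local = list(shape)
    for idx, axis in enumerate(dims_mapping):
        if axis >= 0:                       # this dim is sharded across mesh axis `axis`
            local[idx] = local[idx] // mesh_topology[axis]
    return local

# shard_shape([-1, 16, 0, 48], [-1, 0, -1, -1], [2]) -> [-1, 8, 0, 48]
# ---- end sketch ----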
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py @@ -174,6 +174,7 @@ def get_program(): dtype='float32') label = static.data( name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] # dataloader dataloader = paddle.io.DataLoader.from_generator( @@ -194,6 +195,17 @@ def get_program(): "dims_mapping": [-1, -1, -1] }) + # fill constant bsz like + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) + auto.shard_tensor( + tmp, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, 0, -1, -1] + }) + + # model mlp_start = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -395,6 +407,9 @@ def completion(train_program, start_program, dist_context): op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_idx = 1 + elif op.type == "fill_constant_batch_size_like": + op_dist_attr.impl_type = "fill_constant_batch_size_like" + op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_type = "default" op_dist_attr.impl_idx = 0 @@ -428,6 +443,12 @@ class TestMLP(unittest.TestCase): dist_main_prog, dist_startup_prog = partition( train_program, start_program, dist_context) global_block_ops = dist_main_prog.blocks[0].ops + + fill_op = None + for op in global_block_ops: + if op.type == "fill_constant_batch_size_like": + fill_op = op + global_block_ops = [op.type for op in global_block_ops] sub_block_ops = dist_main_prog.blocks[1].ops sub_block_ops = [op.type for op in sub_block_ops] @@ -435,6 +456,13 @@ class TestMLP(unittest.TestCase): self.assertTrue("c_allreduce_sum" in global_block_ops) self.assertTrue("c_allreduce_sum" in sub_block_ops) + # test fill_constant_batch_size_like + + self.assertTrue(fill_op is not None) + ref_shape = [-1, 8, 0, 48] + shape = fill_op.attr("shape") + self.assertTrue(ref_shape == shape) + if __name__ == "__main__": unittest.main() -- GitLab From 4a4215ffad5efada31dcdae9262a806635b1f226 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:14:31 +0800 Subject: [PATCH 054/272] [bf16] add bf16 kernel: softmax & log_softmax (#39999) * add softmax log_softmax * refine rocm * refine unittest --- paddle/fluid/operators/log_softmax_op.cu | 16 ++-- paddle/fluid/operators/math/softmax.cu | 13 +++ paddle/fluid/operators/math/softmax_impl.h | 91 +++++++++++++++++++ .../platform/device/gpu/rocm/miopen_helper.h | 17 ++++ paddle/phi/common/amp_type_traits.h | 42 +++++++++ paddle/phi/common/bfloat16.h | 18 ++-- paddle/phi/common/float16.h | 12 --- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 +++ .../gpudnn/softmax_grad_kernel_gpudnn.cu | 14 ++- .../kernels/gpudnn/softmax_kernel_gpudnn.cu | 14 ++- .../fluid/tests/unittests/test_log_softmax.py | 30 +++++- .../fluid/tests/unittests/test_softmax_op.py | 52 ++++++++++- 14 files changed, 305 insertions(+), 34 deletions(-) create mode 100644 paddle/phi/common/amp_type_traits.h diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b3..8770abdac83 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
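The log_softmax hunks here swap the fluid fp16 type-traits header for the new phi amp_type_traits and add bfloat16 registrations; the math itself is the usual numerically stable formulation that the unit tests also compare against via a NumPy reference. For orientation, one way to write that reference (details of the in-tree ref_log_softmax helper may differ):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def stable_log_softmax(x, axis=-1):
    # subtract the per-axis max first so exp() cannot overflow
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
# ---- end sketch ----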
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ff..83b124902eb 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c1..9833b4447ec 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + 
+ auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 34b9d57e055..1a514d2aca2 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -140,6 +140,23 @@ class CudnnDataType { } }; +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static 
ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + template <> class CudnnDataType { public: diff --git a/paddle/phi/common/amp_type_traits.h b/paddle/phi/common/amp_type_traits.h new file mode 100644 index 00000000000..ce3a469f5ae --- /dev/null +++ b/paddle/phi/common/amp_type_traits.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace dtype { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 3fd8eb1b268..cf99bb8f19a 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -377,31 +377,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static phi::dtype::bfloat16(min)() { + HOSTDEVICE static phi::dtype::bfloat16(min)() { return phi::dtype::raw_uint16_to_bfloat16(0x007f); } - static phi::dtype::bfloat16 lowest() { + HOSTDEVICE static phi::dtype::bfloat16 lowest() { return phi::dtype::raw_uint16_to_bfloat16(0xff7f); } - static phi::dtype::bfloat16(max)() { + HOSTDEVICE static phi::dtype::bfloat16(max)() { return phi::dtype::raw_uint16_to_bfloat16(0x7f7f); } - static phi::dtype::bfloat16 epsilon() { + HOSTDEVICE static phi::dtype::bfloat16 epsilon() { return phi::dtype::raw_uint16_to_bfloat16(0x3400); } - static phi::dtype::bfloat16 round_error() { + HOSTDEVICE static phi::dtype::bfloat16 round_error() { return phi::dtype::bfloat16(0.5); } - static phi::dtype::bfloat16 infinity() { + HOSTDEVICE static phi::dtype::bfloat16 infinity() { return phi::dtype::raw_uint16_to_bfloat16(0x7f80); } - static phi::dtype::bfloat16 quiet_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 quiet_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xffc1); } - static phi::dtype::bfloat16 signaling_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 signaling_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xff81); } - static phi::dtype::bfloat16 denorm_min() { + HOSTDEVICE static phi::dtype::bfloat16 denorm_min() { return phi::dtype::raw_uint16_to_bfloat16(0x0001); } }; diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 6ed9c88d705..1cdcdef2c12 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,18 +988,6 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } -template -class MPTypeTrait { - public: - using Type = T; -}; - -template <> -class MPTypeTrait { - public: - using Type = float; -}; - } // namespace dtype } // namespace phi diff --git 
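Across this commit, bfloat16 kernels accumulate in float: phi::dtype::MPTypeTrait maps bfloat16 (like float16) to float as the math type, and the new tests build their inputs with convert_float_to_uint16. A bit-level sketch of why that matters, assuming the common convention that bfloat16 is simply the upper 16 bits of an IEEE float32 (the in-tree conversion helper may round rather than truncate):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def float_to_bf16_bits(x):
    # keep only the top 16 bits of the float32 pattern (plain truncation)
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float(bits):
    return (np.asarray(bits, dtype=np.uint32) << 16).view(np.float32)

print(bf16_bits_to_float(float_to_bf16_bits(np.float32(3.14159))))  # 3.140625: ~3 decimal digits survive
# ---- end sketch ----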
a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu index aa496d3cd39..04052e0dfc3 100644 --- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax_grad, phi::SoftmaxGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 32efb9b7764..03c5714b967 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax, phi::SoftmaxRawKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 45798b88bb5..c9c549379bb 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -47,6 +49,11 @@ class VecT4 { public: using Type = int2; }; +template <> +class VecT4 { + public: + using Type = int2; +}; // Vectorization trait 2 * sizeof(T) template @@ -66,6 +73,11 @@ class VecT2 { public: using Type = int; }; +template <> +class VecT2 { + public: + using Type = int; +}; static inline int log2_ceil(int value) { int log2_value = 0; diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu index 56e5fef6e37..45ab645d373 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -38,7 +38,18 @@ PD_REGISTER_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax_grad, GPUDNN, @@ -48,3 +59,4 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16) {} #endif +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu index 427d1729a13..7685c7dbb68 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -37,7 +37,18 @@ PD_REGISTER_KERNEL(softmax, ALL_LAYOUT, phi::SoftmaxRawGPUDNNKernel, 
float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax, GPUDNN, @@ -47,3 +58,4 @@ PD_REGISTER_KERNEL(softmax, double, phi::dtype::float16) {} #endif +#endif diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index d1437ca9c96..16f954708d4 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,8 +14,9 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.fluid.core as core import paddle.nn.functional as F np.random.seed(10) @@ -74,6 +75,33 @@ class TestLogSoftmaxAxis(TestLogSoftmaxOp): self.axis = 1 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLogSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.dtype = np.uint16 + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + class TestNNLogSoftmaxAPI(unittest.TestCase): def setUp(self): self.x_shape = [2, 3, 4, 5] diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index a1cbefa40f3..4f1c37a2424 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -296,6 +296,56 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = "softmax" + self.use_cudnn = self.init_cudnn() + self.use_mkldnn = False + self.dtype = np.uint16 + self.shape = [10, 10] + self.axis = -1 + + np.random.seed(0) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float32) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = { + 'axis': self.axis, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn + } + + def init_cudnn(self): + return False + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad(self): + 
place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ["X"], + "Out", + numeric_grad_delta=0.05, + check_dygraph=(self.use_mkldnn == False)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op): + def init_cudnn(self): + return True + + class TestSoftmaxAPI(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( -- GitLab From 07dad6d6ec415758d520e33960a0c53e50ef2ab5 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 2 Mar 2022 02:16:04 -0600 Subject: [PATCH 055/272] [Infrt]add phi kernel dialect (#39726) --- .gitignore | 3 + .../pybind/kernel_signature_generator.cc | 26 +- paddle/infrt/dialect/infrt/common_type.h | 18 +- paddle/infrt/dialect/infrt/infrt_ops_base.td | 7 +- paddle/infrt/dialect/init_infrt_dialects.cc | 4 + paddle/infrt/dialect/phi/ir/CMakeLists.txt | 7 +- .../infrt/dialect/phi/ir/infrt_phi_kernel.td | 24 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 11 +- paddle/infrt/dialect/phi/ir/phi_kernels.cc | 44 +++ paddle/infrt/dialect/phi/ir/phi_kernels.h | 42 +++ .../infrt/dialect/phi/pass/kernel_op_desc.cc | 45 ++- paddle/infrt/host_context/mlir_exec.cc | 2 + paddle/infrt/kernel/phi/context_kernels.cc | 8 +- paddle/infrt/kernel/phi/context_kernels.h | 3 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 34 ++- .../infrt/kernel/phi/dense_tensor_kernels.h | 3 +- .../infershaped/infershape_launchers_test.cc | 2 +- paddle/infrt/kernel/phi/registry.cc | 2 + .../tests/dialect/pten/dense_tensor.mlir | 12 +- paddle/scripts/infrt_build.sh | 4 +- tools/infrt/generate_phi_kernel_dialect.py | 276 ++++++++++++++++++ tools/infrt/get_phi_kernel_info.py | 12 +- 22 files changed, 536 insertions(+), 53 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.cc create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.h create mode 100644 tools/infrt/generate_phi_kernel_dialect.py diff --git a/.gitignore b/.gitignore index cecd6fa91c7..debec551d9c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. 
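Further below, kernel_signature_generator.cc stops printing a trailing comma after every element and "deleting" it with a backspace character, and instead emits the first element followed by comma-prefixed ones, keyed by kernel name with inputs/attrs/outputs lists. For comparison, producing that kind of signature map from Python is a one-liner with the json module (the entries here are examples, not the tool's actual output):

# ---- illustrative sketch, not part of the diff ----
import json

signatures = {
    "reduce_prod": {"inputs": ["X"],
                    "attrs": ["dim", "keep_dim", "reduce_all"],
                    "outputs": ["Out"]},
}
print(json.dumps(signatures))   # quoting and comma placement handled by the library
# ---- end sketch ----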
paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8283a249ded..f0d5a4e477f 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -49,24 +49,30 @@ int main(int argc, char **argv) { if (kernel_signature_map.Has(op_kernel_pair.first)) { std::cout << "\"" << op_kernel_pair.first << "\":{"; auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + std::cout << "\"inputs\":["; - for (auto name : std::get<0>(args)) { - std::cout << "\"" << name << "\","; + auto inputs_ = std::get<0>(args); + if (inputs_.size() > 0) std::cout << inputs_[0]; + for (size_t i = 1; i < inputs_.size(); i++) { + std::cout << ",\"" << inputs_[i] << "\""; } - if (std::get<0>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"attrs\":["; - for (auto name : std::get<1>(args)) { - std::cout << "\"" << name << "\","; + auto attrs_ = std::get<1>(args); + if (attrs_.size() > 0) std::cout << attrs_[0]; + for (size_t i = 1; i < attrs_.size(); i++) { + std::cout << ",\"" << attrs_[i] << "\""; } - if (std::get<1>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"outputs\":["; - for (auto name : std::get<2>(args)) { - std::cout << "\"" << name << "\","; + auto outputs_ = std::get<2>(args); + for (size_t i = 1; i < outputs_.size(); i++) { + std::cout << ",\"" << outputs_[i] << "\""; } - if (std::get<2>(args).size() > 0) std::cout << "\b"; + std::cout << "]},"; } } - std::cout << "\b}" << std::endl; + std::cout << "}" << std::endl; return 0; } diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h index d6d6503c03b..436e7920ca5 100644 --- a/paddle/infrt/dialect/infrt/common_type.h +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -21,8 +21,22 @@ namespace infrt { enum class TargetType : uint8_t { CPU, GPU, UNK }; -enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; -enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, ANY, UNK }; +enum class PrecisionType : uint8_t { + UINT8, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + BFLOAT16, + FLOAT32, + FLOAT64, + COMPLEX64, + COMPLEX128, + BOOL, + UNK +}; struct Place { TargetType target; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 978b126d754..f19912dc0cd 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -34,9 +34,10 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { let summary = "infrt dense tensor"; let description = [{dense_tensor<, 3>}]; let parameters = (ins - "TargetType":$target, - "PrecisionType":$precision, - "LayoutType":$layout + "::infrt::TargetType":$target, + "::infrt::PrecisionType":$precision, + "::infrt::LayoutType":$layout + ); } diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index c5c81b4b0f2..5eae0171936 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,6 +23,8 @@ #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include 
"paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -34,6 +36,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT mlir::pd::PaddleDialect, #ifdef INFRT_WITH_PHI phi::PHIDenseTensorDialect, + phi::PHICPUKernelDialect, + phi::PHIGPUKernelDialect, phi::PHIDialect #endif >(); diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt index 8c1d75629d0..0497b983211 100644 --- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -1,9 +1,12 @@ #mlir_tablegen_on(infrt_phi_base DIALECT phi) add_mlir_dialect(infrt_phi_base phi) add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) +add_mlir_dialect(phi_cpu_kernels phi_cpu) +add_mlir_dialect(phi_gpu_kernels phi_gpu) + #mlir_tablegen_on(infrt_phi_tensor) gather_srcs(infrt_src SRCS phi_base.cc - infrt_phi_tensor.cc) + infrt_phi_tensor.cc + phi_kernels.cc) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 37bf0b5ef21..ee23470fc75 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -6,24 +6,32 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" -def PHI_KernelDialect : Dialect { - let name = "phi_kernel"; +def PHI_CPUKernelDialect : Dialect { + let name = "phi_cpu"; let description = [{ - The PHI Kernel dialect. + The PHI CPU Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +def PHI_GPUKernelDialect : Dialect { + let name = "phi_gpu"; + + let description = [{ + The PHI GPU Kernel dialect. }]; let cppNamespace = "::infrt::phi"; } // PHI Kernel related ops. -class PDT_Kernel traits = []> : Op { +class PDTCPU_Kernel traits = []> : Op { } -def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); - let results = (outs DenseTensor:$output); +// PHI Kernel related ops. +class PDTGPU_Kernel traits = []> : Op { } #endif - diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index dc3a4b340d7..39677871ff8 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,6 +34,14 @@ class FillDenseTensorOp : attr_type:$value ); let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class PrintDenseTensorOp: + PDT_Op<"print_tensor"> { + let arguments = (ins DenseTensor:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } class CreateCPUAllocatorOp @@ -44,7 +52,7 @@ class CreateCPUAllocatorOp class CreateCPUContextOp : PDT_Op<"create_context." 
# "cpu", [NoSideEffect]> { - let arguments = (ins); + let arguments = (ins CPU_Allocator:$input); let results = (outs CPU_Context:$output); } @@ -52,6 +60,7 @@ def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nc def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc new file mode 100644 index 00000000000..c7a837b83fc --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + +namespace infrt { +namespace phi { + +void PHICPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + >(); +} + +void PHIGPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + >(); +} + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h new file mode 100644 index 00000000000..b84d1b2b729 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 63869b7d7b9..6c0f6df8921 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -41,26 +41,49 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } phi::DataType cvtPrecision2Phi(PrecisionType precision) { +#define CONVERT_PRECISION_TO_PHI(Precision) \ + case PrecisionType::Precision: \ + return phi::DataType::Precision; + switch (precision) { - case PrecisionType::FLOAT32: - return phi::DataType::FLOAT32; - break; - case PrecisionType::FLOAT16: - return phi::DataType::FLOAT16; + CONVERT_PRECISION_TO_PHI(FLOAT32) + CONVERT_PRECISION_TO_PHI(FLOAT16) + CONVERT_PRECISION_TO_PHI(FLOAT64) + CONVERT_PRECISION_TO_PHI(UINT8) + CONVERT_PRECISION_TO_PHI(INT8) + CONVERT_PRECISION_TO_PHI(INT16) + CONVERT_PRECISION_TO_PHI(INT32) + CONVERT_PRECISION_TO_PHI(INT64) + CONVERT_PRECISION_TO_PHI(COMPLEX64) + CONVERT_PRECISION_TO_PHI(COMPLEX128) + CONVERT_PRECISION_TO_PHI(BOOL) default: return phi::DataType::UNDEFINED; } +#undef CONVERT_PRECISION_TO_PHI } PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +#define CONVERT_PRECISION_FROM_PHI(Precision) \ + case phi::DataType::Precision: \ + return PrecisionType::Precision; + switch (datatype) { - case phi::DataType::FLOAT32: - return PrecisionType::FLOAT32; - case phi::DataType::FLOAT16: - return PrecisionType::FLOAT16; + CONVERT_PRECISION_FROM_PHI(FLOAT32) + CONVERT_PRECISION_FROM_PHI(FLOAT16) + CONVERT_PRECISION_FROM_PHI(FLOAT64) + CONVERT_PRECISION_FROM_PHI(UINT8) + CONVERT_PRECISION_FROM_PHI(INT8) + CONVERT_PRECISION_FROM_PHI(INT16) + CONVERT_PRECISION_FROM_PHI(INT32) + CONVERT_PRECISION_FROM_PHI(INT64) + CONVERT_PRECISION_FROM_PHI(COMPLEX64) + CONVERT_PRECISION_FROM_PHI(COMPLEX128) + CONVERT_PRECISION_FROM_PHI(BOOL) default: return PrecisionType::UNK; } +#undef CONVERT_PRECISION_FROM_PHI } phi::DataLayout cvtLayout2Phi(LayoutType layout) { @@ -69,6 +92,8 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { return phi::DataLayout::NCHW; case LayoutType::NHWC: return phi::DataLayout::NHWC; + case LayoutType::ANY: + return phi::DataLayout::ANY; default: return phi::DataLayout::UNDEFINED; } @@ -80,6 +105,8 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { return LayoutType::NCHW; case phi::DataLayout::NHWC: return LayoutType::NHWC; + case phi::DataLayout::ANY: + return LayoutType::ANY; default: return LayoutType::UNK; } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 79717ba2cc0..7823681079f 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -29,6 +29,7 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI +#include 
"paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif @@ -58,6 +59,7 @@ int main(int argc, char** argv) { kernel::RegisterControlFlowKernels(®istry); #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); + kernel::RegisterInferShapeLaunchers(®istry); #endif // load extra shared library diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 5284f499916..3caaf1788e3 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -18,7 +18,13 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext( + infrt::backends::CpuPhiAllocator* allocator) { + ::phi::CPUContext context; + context.SetAllocator(allocator); + context.Init(); + return context; +} } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 8082dc6c2ff..7f1e7ef6cd3 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" @@ -21,7 +22,7 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(::infrt::backends::CpuPhiAllocator*); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index ce9200b9918..871336e8762 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" - +#include namespace infrt { namespace kernel { namespace phi { @@ -30,8 +30,38 @@ namespace phi { } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) {} + host_context::Attribute> values) { + auto place = ::phi::CPUPlace(); + float* a_data = dense_tensor->mutable_data(place); + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (values.get())[i]; + } +} +void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + break; \ + } + + ::phi::DDim dims = dense_tensor->dims(); + std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," + << " values=["; + switch (dense_tensor->dtype()) { + PRINT_META_DATA(FLOAT32, float); + PRINT_META_DATA(INT32, int32_t); + default: + std::cout << "Error! 
Unsupported data type!\n"; + } + std::cout << "]\n"; +#undef PRINT_META_DATA +} } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 25daf7027e8..920c0b1c8af 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -28,7 +28,8 @@ namespace phi { host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values); + host_context::Attribute> values); +void PrintDenseTensor(::phi::DenseTensor* dense_tensor); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 2161e98fac8..37f9197edb7 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); + auto creator = registry.GetKernel("phi_cpu.add.any.float32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 5d79814d4be..15e2d21005e 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -42,6 +42,8 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel("phi_dt.print_tensor", + INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); registry->AddKernel( "phi_dt.fake_phi_kernel", std::bind(&KernelLauncherFunc !phi.CPU_allocator - %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () + %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: @FakePhiKernel@ - %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + // CHECK: dense_tensor: shape=shape[1], values=[1] + "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () Infrt.return } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index a0132501387..75b27e4165d 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -33,14 +33,16 @@ function update_pd_ops() { rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cmake .. 
-DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j8 paddle_python + make -j8 paddle_python print_pten_kernels cd ${PADDLE_ROOT}/build + ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json cd python/dist/ python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl # update pd_ops.td cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py + python3 generate_phi_kernel_dialect.py ./kernels.json } function init() { diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py new file mode 100644 index 00000000000..80cf3958b15 --- /dev/null +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -0,0 +1,276 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys + +attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} +supported_kernels = ['sign', 'dot', 'digamma', 'conj'] + +target_type_converter = {"CPU": "CPU", "GPU": "GPU"} +layout_type_converter = { + "NCHW": "NCHW", + "NHWC": "NHWC", + "Undefined(AnyLayout)": "ANY" +} +precision_type_converter = { + "uint8": "UINT8", + "int8": "INT8", + "int16": "INT16", + "int32": "INT32", + "int64": "INT64", + "float16": "FLOAT16", + "bfloat16": "BFLOAT16", + "float32": "FLOAT32", + "float64": "FLOAT64", + "complex64": "COMPLEX64", + "complex128": "COMPLEX128", + "bool": "BOOL" +} + + +def generate_kernel_name(op_name, place_str): + [target_, layout_, precision_] = place_str[1:-1].split(',') + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + alias_ = "{}.{}".format(op_name, ".".join( + [target_.strip(), layout_.strip(), precision_.strip()])) + return alias_ + + +def generate_attrs_info(op_name, attrs_info): + kernel_attrs_names = { + 'split': ['sections', 'num', 'axis', 'mkldnn_data_type'], + 'sign': [], + 'masked_select': [], + 'trace': ['offset', 'axis1', 'axis2'], + 'concat': ['axis'], + 'empty': ['shape', 'dtype'], + 'conj': [], + 'norm': ['axis', 'epsilon', 'is_test'], + 'histogram': ['bins', 'min', 'max'], + 'dot': [], + 'scale': ['scale', 'bias', 'bias_after_scale'], + 'digamma': [], + 'lerp': [], + 'cast': ['out_dtype', 'in_dtype'], + 'abs': [] + } + attrs_args_ = "" + if len(kernel_attrs_names[op_name]) == len(attrs_info): + for index in range(len(attrs_info)): + attr_name = kernel_attrs_names[op_name][index] + attr_type = attr_type_converter[attrs_info[index]] + attrs_args_ += '{type_}:${name_},'.format( + type_=attr_type, name_=attr_name) + return attrs_args_[:-1] + + +def generate_inputs_info(input_info): + input_args_ = "" + for index in range(len(input_info)): + [target_, layout_, precision_] = input_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = 
layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + input_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$in{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + input_args_ = input_args_[:-1] + return input_args_ + + +def generate_arguments_info(op_name, input_info, attr_info): + input_args = generate_inputs_info(input_info) + attr_args = generate_attrs_info(op_name, attr_info) + context_args = "CPU_Context:$dev_ctx" + argument_ = "{},{},{}".format(context_args, input_args, attr_args) + return (("let arguments = (ins {});".format(argument_.strip(",")))) + + +def generate_results_info(output_info): + output_args_ = "let results = (outs " + for index in range(len(output_info)): + [target_, layout_, precision_] = output_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + output_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$out{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + return ("{});".format(output_args_[:-1])) + + +def generate_supported_kernel_list(load_dict): + supported_kernels_list_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + attributes = kernel_info[kernel_alias_]["attribute"] + flag = True + for attribute in attributes: + if attribute not in attr_type_converter: + flag = False + if flag: + supported_kernels_list_.append(op_name) + + alias_ = generate_kernel_dialect(op_name, kernel_alias_, + kernel_info[kernel_alias_]) + supported_kernels_list_ = list(set(supported_kernels_list_)) + print(supported_kernels_list_) + + +def scan_kernel_info(load_dict): + target_type_ = [] + layout_type_ = [] + precision_type_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + [target_, layout_, precision_] = kernel_alias_[1:-1].split(',') + target_type_.append(target_.strip()) + layout_type_.append(layout_.strip()) + precision_type_.append(precision_.strip()) + target_type_ = list(set(target_type_)) + layout_type_ = list(set(layout_type_)) + precision_type_ = list(set(precision_type_)) + print(target_type_) + print(layout_type_) + print(precision_type_) + + +def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." 
+ dialect_name[ + 3] + + header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ + 3] + + header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_dialect_head(): + comment_ = "/*===- TableGen'source file -----------------------------------------------===*\\\n\ +|* *|\n\ +|* Kernel Definitions *|\n\ +|* *|\n\ +|* Automatically generated file, do not edit! 
*|\n\ +|* Generated by tools/infrt/generate_pten_kernel_dialect.py *|\n\ +|* *|\n\ +\*===----------------------------------------------------------------------===*/\n" + + includes_ = "#ifndef PTEN_KERNELS\n\ +#define PTEN_KERNELS\n\ +include \"mlir/Interfaces/InferTypeOpInterface.td\"\n\ +include \"mlir/Interfaces/LoopLikeInterface.td\"\n\ +include \"mlir/IR/OpBase.td\"\n\ +include \"paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td\"" + + return (comment_ + includes_) + + +def get_kernel_target(kernel_alias_): + target = kernel_alias_[1:-1].split(",") + return target[0] + + +def main(path_): + with open(path_, "r") as f: + load_dict = json.load(f) + + head = generate_dialect_head() + + cpu_registry_ = "" + gpu_registry_ = "" + for op_name in load_dict: + if op_name not in supported_kernels: + continue + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + if get_kernel_target(kernel_alias_) == "CPU": + kernel_registry = generate_cpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + cpu_registry_ += kernel_registry + elif get_kernel_target(kernel_alias_) == "GPU": + kernel_registry = generate_gpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + gpu_registry_ += kernel_registry + else: + print("Unsupported backend:" + get_kernel_target( + kernel_alias_)) + end = "#endif // PTEN_KERNELS" + with open("../../paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=cpu_registry_, end_=end)) + with open("../../paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=gpu_registry_, end_=end)) + + +if __name__ == '__main__': + path = sys.argv[1] + main(path) diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index f3e9f345da2..9ea3fef0030 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -150,19 +150,19 @@ def gen_dtype(vals: List[str]): ir_dtypes, origin_dtypes = [], [] for val in vals: if val == "float": - ir_dtypes.append("fp32") + ir_dtypes.append("float32") origin_dtypes.append("float") elif val == "double": - ir_dtypes.append("fp64") + ir_dtypes.append("float64") origin_dtypes.append("double") elif val == "float16": - ir_dtypes.append("fp16") + ir_dtypes.append("float16") origin_dtypes.append("paddle::experimental::float16") elif val == "bfloat16": ir_dtypes.append("bf16") origin_dtypes.append("paddle::experimental::bfloat16") elif val == "bool": - ir_dtypes.append("int1") + ir_dtypes.append("bool") origin_dtypes.append("bool") elif val == "int8_t": ir_dtypes.append("int8") @@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = 'pten.' + '.'.join( - [it.lower() for it in update_item[:3]]) + "." + ir_dtype + ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[ + 2].lower() + '.' 
+ ir_dtype res += f""" registry->AddKernel("{ir_name}",""" -- GitLab From f30b3f810d1b7e341507450313503cf4702f7d8a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 2 Mar 2022 16:17:43 +0800 Subject: [PATCH 056/272] support checking `phi` directory in CI op benchmark (#40026) * support phi checking in CI op benchmark * add sparse/gpu * remove h file in cpu directory --- tools/ci_op_benchmark.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 1db79418b2d..0937ebe5343 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -43,20 +43,33 @@ function match_cu_file_directory { do [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 done - for sub_dir in "" "/gpu" "/hybird" + for sub_dir in "" "/gpu" "/gpudnn" "/sparse/gpu" do [ "${cu_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 0 done return 1 } +# Limit h file directory +function match_h_file_directory { + LOG "[INFO] run function match_h_file_directory" + local sub_dir h_file_dir + h_file_dir=$(dirname ${1}) + # '.h' file should not in directory below + for sub_dir in "" "/cpu" + do + [ "${h_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 1 + done + return 0 +} + # Load op files by header file function load_CHANGE_OP_FILES_by_header_file { LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file for change_file in $(grep -rl "${1}" paddle/fluid/operators paddle/phi/kernels/) do - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -64,6 +77,7 @@ function load_CHANGE_OP_FILES_by_header_file { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue [ -n "${INCLUDE_SEARCH_MAP[$change_file]}" ] && continue LOG "[INFO] Found \"${1}\" include by \"${change_file}\", keep searching." INCLUDE_SEARCH_MAP[$change_file]="searched" @@ -82,7 +96,7 @@ function load_CHANGE_OP_FILES { # match directory limit [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/phi/kernels/" ]] || continue # match file name limit - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -90,6 +104,7 @@ function load_CHANGE_OP_FILES { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue LOG "[INFO] Found \"${change_file}\" changed, keep searching." INCLUDE_SEARCH_MAP[${change_file}]="searched" load_CHANGE_OP_FILES_by_header_file $change_file @@ -131,6 +146,8 @@ function load_CHANGE_OP_MAP { op_name=${change_file_name##*/} op_name=${op_name%_cudnn_op*} op_name=${op_name%_op*} + op_name=${op_name%_grad_kernel*} + op_name=${op_name%_kernel*} [ -n "${SKIP_OP_MAP[$op_name]}" ] && continue LOG "[INFO] Load op: \"${op_name}\"." 
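(Editorial sketch, not part of the patch.) The ci_op_benchmark.sh hunk above now derives a benchmark op name from both the legacy `*_op.cu` files and the new phi `*_kernel.cu` / `*_kernel_gpudnn.cu` files by stripping suffixes with shell parameter expansion. A minimal Python rendering of that stripping rule, for illustration only; the helper name `derive_op_name` and the sample paths are assumptions:

    # Illustrative only: mirrors the ${op_name%_cudnn_op*} / %_op* / %_grad_kernel* /
    # %_kernel* expansions in load_CHANGE_OP_MAP; each pattern strips from the
    # right-most match, applied in order.
    def derive_op_name(change_file):
        name = change_file.rsplit('/', 1)[-1]
        for pat in ('_cudnn_op', '_op', '_grad_kernel', '_kernel'):
            idx = name.rfind(pat)
            if idx != -1:
                name = name[:idx]
        return name

    assert derive_op_name("paddle/phi/kernels/gpu/abs_grad_kernel.cu") == "abs"
    assert derive_op_name("paddle/fluid/operators/concat_op.cu") == "concat"
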
CHANGE_OP_MAP[${op_name}]="$change_file" -- GitLab From 1c4e3e5dd0d32a4216bdad0b1cafcab4ca5ed5bb Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:23:52 +0800 Subject: [PATCH 057/272] new fleet_desc builder (#39948) * delete gloo connect retry * the_one_ps dirs reconstruct * . * . * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * refactor ps optimize * refactor ps optimize * refactor ps optimize * . * . * . * . * . * . * refactor theoneps * the_one_ps * add ps pass unittest * add ps pass unittest * ps unitest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * add cpu_async_ps_mode test * add cpu_async_ps_mode test * add cpu_async_ps_mode test * ps unittest ready * ps unittest ready * solve dist_pass init conflict * solve import CommContext error * unittest ok * implement AllocateFrom * solve setup.py.in conflict * solve conflict * solve conflict * solve conflict * . * . * cpu-async-ps minimize test ok & gpu minimize test ok * add heter 2stage unittest * add heter 2stage unittest * add heter 2stage unittest * sync/geo test ok & fix heter_worker program ok * . * new fleet desc generator * new fleet_desc builder * new fleet_desc builder * . * . * correct ps.proto compile * . Co-authored-by: zkh2016 --- paddle/fluid/distributed/ps/ps.proto | 13 - paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/ps.proto | 213 ++++ .../fleet/meta_optimizers/ps_optimizer.py | 1 + python/paddle/distributed/ps/README.md | 3 - python/paddle/distributed/ps/the_one_ps.py | 1022 ++++++++--------- .../paddle/distributed/ps/utils/ps_factory.py | 4 +- .../ps/utils/ps_program_builder.py | 5 +- python/paddle/distributed/ps/utils/public.py | 4 +- .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../distributed_passes/ps_pass_test_base.py | 54 +- .../test_ps_trainer_pass.py | 122 +- .../fluid/tests/unittests/ps/CMakeLists.txt | 4 +- .../tests/unittests/ps/ps_dnn_trainer.py | 86 +- .../tests/unittests/ps/test_the_one_ps.py | 92 +- .../fluid/tests/unittests/ps_dnn_model.py | 1 + 16 files changed, 961 insertions(+), 670 deletions(-) delete mode 100755 paddle/fluid/distributed/ps/ps.proto mode change 100644 => 100755 paddle/fluid/framework/CMakeLists.txt create mode 100755 paddle/fluid/framework/ps.proto delete mode 100755 python/paddle/distributed/ps/README.md mode change 100644 => 100755 python/paddle/fluid/tests/unittests/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py diff --git a/paddle/fluid/distributed/ps/ps.proto b/paddle/fluid/distributed/ps/ps.proto deleted file mode 100755 index 2691f637527..00000000000 --- a/paddle/fluid/distributed/ps/ps.proto +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43..02d90b9c6da --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 00000000000..0ae87812bce --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated 
TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 100a6882b1b..00937dbe7a4 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,6 +54,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): attrs['cloned_startup'] = 
attrs['origin_startup_program'].clone() attrs['user_defined_strategy'] = self.user_defined_strategy + attrs['valid_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode logger.info("ps_mode: {}".format(attrs['ps_mode'])) diff --git a/python/paddle/distributed/ps/README.md b/python/paddle/distributed/ps/README.md deleted file mode 100755 index 8d28031794f..00000000000 --- a/python/paddle/distributed/ps/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# 目录说明 - -* 改完之后,上层目录中 fleet 中相关文件(夹)就可以删除 diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 14a68ad9167..cc744bc9d9e 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -15,10 +15,11 @@ import warnings import os +from paddle.distributed.fleet.proto import ps_pb2 import paddle.fluid as fluid import paddle.distributed.fleet as fleet from paddle.fluid import core -from .utils.public import * +from paddle.distributed.ps.utils.public import * from paddle.fluid.framework import Program from paddle.fluid.compiler import CompiledProgram from paddle.fluid.executor import Executor @@ -29,14 +30,10 @@ from paddle.distributed.fleet.base.private_helper_function import wait_server_re from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format -__all__ = [] - - -def conv_indent(indent): - return "".join([" "] * indent) - - -PSERVER_SAVE_SUFFIX = ".shard" +__all__ = [ + 'Table', 'SparseTable', 'GeoSparseTable', 'BarrierTable', 'TensorTable', + 'DenseTable' +] def get_program_by_id(context, program_id): @@ -62,129 +59,140 @@ def parse_table_class(varname, program_id, context): return "MemorySparseTable" -def get_default_accessor_proto(accessor, varname, program_id, context): +def check_embedding_dim(accessor_proto, varname, program_id, context): main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] + print('new var: {}, {}, {}'.format(var, embedding_dim, + accessor_proto.fea_dim)) break - - if not accessor.HasField("accessor_class"): - accessor.accessor_class = "CtrCommonAccessor" - if not accessor.HasField("fea_dim"): - accessor.fea_dim = embedding_dim + 2 - if not accessor.HasField("embedx_dim"): - accessor.embedx_dim = embedding_dim - 1 - if not accessor.HasField("embedx_threshold"): - accessor.embedx_threshold = 0 - - ctr_accessor_param = accessor.ctr_accessor_param - if not ctr_accessor_param.HasField("nonclk_coeff"): - ctr_accessor_param.nonclk_coeff = 0.1 - if not ctr_accessor_param.HasField("click_coeff"): - ctr_accessor_param.click_coeff = 1.0 - if not ctr_accessor_param.HasField("base_threshold"): - ctr_accessor_param.base_threshold = 0 - if not ctr_accessor_param.HasField("delta_threshold"): - ctr_accessor_param.delta_threshold = 0 - if not ctr_accessor_param.HasField("delta_keep_days"): - ctr_accessor_param.delta_keep_days = 16 - if not ctr_accessor_param.HasField("show_click_decay_rate"): - ctr_accessor_param.show_click_decay_rate = 1 - if not ctr_accessor_param.HasField("delete_threshold"): - ctr_accessor_param.delete_threshold = 0 - if not ctr_accessor_param.HasField("delete_after_unseen_days"): - ctr_accessor_param.delete_after_unseen_days = 30 - if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): - ctr_accessor_param.ssd_unseenday_threshold = 1 - - for sgd_param in 
[accessor.embed_sgd_param, accessor.embedx_sgd_param]: - if not sgd_param.HasField("name"): - sgd_param.name = "SparseAdaGradSGDRule" - if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": - if not sgd_param.adagrad.HasField("learning_rate"): - sgd_param.adagrad.learning_rate = 0.05 - if not sgd_param.adagrad.HasField("initial_g2sum"): - sgd_param.adagrad.initial_g2sum = 3.0 - if not sgd_param.adagrad.HasField("initial_range"): - sgd_param.adagrad.initial_range = 0.0001 - if len(sgd_param.adagrad.weight_bounds) == 0: - sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseNaiveSGDRule": - if not sgd_param.naive.HasField("learning_rate"): - sgd_param.naive.learning_rate = 0.05 - if not sgd_param.naive.HasField("initial_range"): - sgd_param.naive.initial_range = 0.0001 - if len(sgd_param.naive.weight_bounds) == 0: - sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": - if not sgd_param.adam.HasField("learning_rate"): - sgd_param.adam.learning_rate = 0.001 - if not sgd_param.adam.HasField("initial_range"): - sgd_param.adam.initial_range = 0.0001 - if not sgd_param.adam.HasField("beta1_decay_rate"): - sgd_param.adam.beta1_decay_rate = 0.9 - if not sgd_param.adam.HasField("beta2_decay_rate"): - sgd_param.adam.beta2_decay_rate = 0.999 - if not sgd_param.adam.HasField("ada_epsilon"): - sgd_param.adam.ada_epsilon = 1e-08 - if len(sgd_param.adam.weight_bounds) == 0: - sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) - - -def check_embedding_dim(accessor, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) - embedding_dim = 0 - for var in main_program.list_vars(): - if var.name == varname: - embedding_dim = var.shape[1] - break - fea_dim = accessor.fea_dim + fea_dim = accessor_proto.fea_dim if fea_dim != embedding_dim + 2: raise ValueError( "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". format(embedding_dim + 2, fea_dim)) - embedx_dim = accessor.embedx_dim + embedx_dim = accessor_proto.embedx_dim if embedx_dim != embedding_dim - 1: raise ValueError( "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". 
format(embedding_dim - 1, embedx_dim)) +class Service: + def __init__(self): + pass + + def _set(self, service_proto): + service_proto.server_class = "BrpcPsServer" + service_proto.client_class = "BrpcPsClient" + service_proto.service_class = "BrpcPsService" + service_proto.start_server_port = 0 + service_proto.server_thread_num = 12 + + +class GpuService(Service): + def __init__(self): + super(GpuService).__init__(self) + + def _set(self, service_proto): + super(GpuService)._set(service_proto) + service_proto.server_class = 'PsLocalServer' + service_proto.client_class = 'PsLocalClient' + + class Accessor: def __init__(self): self.accessor_class = "" self.optimizer = None - self.feature_dim = -1 - self.embedding_dim = -1 - self.optimizer = None - - def to_string(self, indent): - accessor_str = "{}accessor {{{}\n{}}}" - attrs = "" - attrs += "accessor_class: \"{}\" ".format(self.accessor_class) - attrs += "fea_dim: {} ".format(self.feature_dim) - attrs += "embedx_dim: {} ".format(self.embedding_dim) - attrs += "\n" - if self.optimizer is not None: - attrs += self.optimizer.to_string(indent) - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + self.feature_dim = 0 + self.embedding_dim = 0 + # TableAccessorParameter accessor + def _set(self, accessor_proto, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + embedding_dim = 0 + for var in main_program.list_vars(): + if var.name == varname: + embedding_dim = var.shape[1] + break -class CommonAccessor: + if not accessor_proto.HasField("accessor_class"): + accessor_proto.accessor_class = "CtrCommonAccessor" + if not accessor_proto.HasField("fea_dim"): + accessor_proto.fea_dim = embedding_dim + 2 + if not accessor_proto.HasField("embedx_dim"): + accessor_proto.embedx_dim = embedding_dim - 1 + if not accessor_proto.HasField("embedx_threshold"): + accessor_proto.embedx_threshold = 0 + + ctr_accessor_param = accessor_proto.ctr_accessor_param + if not ctr_accessor_param.HasField("nonclk_coeff"): + ctr_accessor_param.nonclk_coeff = 0.1 + if not ctr_accessor_param.HasField("click_coeff"): + ctr_accessor_param.click_coeff = 1.0 + if not ctr_accessor_param.HasField("base_threshold"): + ctr_accessor_param.base_threshold = 0 + if not ctr_accessor_param.HasField("delta_threshold"): + ctr_accessor_param.delta_threshold = 0 + if not ctr_accessor_param.HasField("delta_keep_days"): + ctr_accessor_param.delta_keep_days = 16 + if not ctr_accessor_param.HasField("show_click_decay_rate"): + ctr_accessor_param.show_click_decay_rate = 1 + if not ctr_accessor_param.HasField("delete_threshold"): + ctr_accessor_param.delete_threshold = 0 + if not ctr_accessor_param.HasField("delete_after_unseen_days"): + ctr_accessor_param.delete_after_unseen_days = 30 + if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): + ctr_accessor_param.ssd_unseenday_threshold = 1 + + for sgd_param in [ + accessor_proto.embed_sgd_param, accessor_proto.embedx_sgd_param + ]: + if not sgd_param.HasField("name"): + sgd_param.name = "SparseAdaGradSGDRule" + if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": + if not sgd_param.adagrad.HasField("learning_rate"): + sgd_param.adagrad.learning_rate = 0.05 + if not sgd_param.adagrad.HasField("initial_g2sum"): + sgd_param.adagrad.initial_g2sum = 3.0 + if not sgd_param.adagrad.HasField("initial_range"): + sgd_param.adagrad.initial_range = 0.0001 + if len(sgd_param.adagrad.weight_bounds) == 0: + 
sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseNaiveSGDRule": + if not sgd_param.naive.HasField("learning_rate"): + sgd_param.naive.learning_rate = 0.05 + if not sgd_param.naive.HasField("initial_range"): + sgd_param.naive.initial_range = 0.0001 + if len(sgd_param.naive.weight_bounds) == 0: + sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseAdamSGDRule": + if not sgd_param.adam.HasField("learning_rate"): + sgd_param.adam.learning_rate = 0.001 + if not sgd_param.adam.HasField("initial_range"): + sgd_param.adam.initial_range = 0.0001 + if not sgd_param.adam.HasField("beta1_decay_rate"): + sgd_param.adam.beta1_decay_rate = 0.9 + if not sgd_param.adam.HasField("beta2_decay_rate"): + sgd_param.adam.beta2_decay_rate = 0.999 + if not sgd_param.adam.HasField("ada_epsilon"): + sgd_param.adam.ada_epsilon = 1e-08 + if len(sgd_param.adam.weight_bounds) == 0: + sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) + + +class CommonAccessor(Accessor): def __init__(self): - self.accessor_class = "" - self.table_name = None - self.entry = None + super(CommonAccessor, self).__init__() + self.table_name = '' + self.entry = 'none' self.attrs = [] self.params = [] self.dims = [] self.trainer_num = 0 - self.sync = "false" - self.table_num = None - self.table_dim = None + self.sync = False self.initializers = [] self.opt_input_map = {} self.opt_attr_map = {} @@ -422,233 +430,361 @@ class CommonAccessor: self.initializers = initializers self.attrs = attrs - def to_string(self, indent): - accessor_str = "{}common {{{}\n{}}}" - attrs = "" - attrs += "name: \"{}\" ".format(self.accessor_class) - - if self.table_name: - attrs += "table_name: \"{}\" ".format(self.table_name) - - if self.entry: - attrs += "entry: \"{}\" ".format(self.entry) - attrs += "trainer_num: {} ".format(self.trainer_num) - attrs += "sync: {} ".format(self.sync) - if self.table_num: - attrs += "table_num: {} ".format(self.table_num) - if self.table_dim: - attrs += "table_dim: {} ".format(self.table_dim) - - for param in self.params: - attrs += "params: \"{}\" ".format(param) - - for dim in self.dims: - attrs += "dims: {} ".format(dim) - - for initializer in self.initializers: - attrs += "initializers: \"{}\" ".format(initializer) - - attrs += "\n" - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + # CommonAccessorParameter common + def _set(self, proto): + proto.name = self.accessor_class + proto.table_name = self.table_name + proto.params.extend(self.params) + proto.dims.extend(self.dims) + proto.initializers.extend(self.initializers) + proto.entry = self.entry + proto.trainer_num = self.trainer_num + proto.sync = self.sync + proto.table_num = self.table_num + proto.table_dim = self.table_dim class Tensor: - def __init__(self): - self.main_program_id = None - self.startup_program_id = None - self.feed_var_name = None - self.fetch_var_name = None - self.tensor_table_class = False - - def to_string(self, indent): - program_str = "{}tensor {{{}\n{}}}" - attrs = "" - attrs += "feed_var_name: \"{}\" ".format(str(self.feed_var_name)) - attrs += "fetch_var_name: \"{}\" ".format(str(self.fetch_var_name)) - attrs += "startup_program_id: {} ".format(str(self.startup_program_id)) - attrs += "main_program_id: {} ".format(str(self.main_program_id)) - attrs += "tensor_table_class: \"{}\" ".format( - str(self.tensor_table_class)) - attrs += "\n" - return program_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + def __init__(self, tesnor_dcit): + 
self.tensor_dict = tesnor_dcit + + def _set(self, tensor_proto): + tensor_proto.main_program_id = self.tensor_dict.get("main_program_id", + 0) + tensor_proto.startup_program_id = self.tensor_dict.get( + "startup_program_id", 0) + tensor_proto.feed_var_name = self.tensor_dict.get("feed_var_name", '') + tensor_proto.fetch_var_name = self.tensor_dict.get("fetch_var_name", '') + tensor_proto.tensor_table_class = self.tensor_dict.get( + "tensor_table_class", '') class Table: def __init__(self): - self.id = -1 self.table_class = None self.shard_num = -1 self.type = None - self.accessor = None - self.common = None + self.accessor = Accessor() + self.shard_num = 256 + self.common = CommonAccessor() self.tensor = None - self.accessor_proto = None - - def to_string(self, indent): - # if self.id == 1: - # proto_txt = '' - # with open('./sparse_table.prototxt') as f: - # proto_txt = f.read() - # return proto_txt - table_str = "{}downpour_table_param {{{}\n{}}}" - - attrs = "" - attrs += "table_id: {} ".format(self.id) - attrs += "table_class: \"{}\" ".format(self.table_class) - attrs += "shard_num: {} ".format(self.shard_num) - attrs += "type: {}".format(self.type) - attrs += "\n" - indent += 2 - - if self.accessor_proto is not None: - accessor_str = "{}accessor {{{}\n{}}}" - accessor_str = accessor_str.format( - conv_indent(indent), self.accessor_proto, conv_indent(indent)) - attrs += accessor_str + "\n" - elif self.accessor is not None: - attrs += self.accessor.to_string(indent) - attrs += "\n" - - if self.tensor is not None: - attrs += self.tensor.to_string(indent) - attrs += "\n" - - if self.common is not None: - attrs += self.common.to_string(indent) - attrs += "\n" - - return table_str.format(conv_indent(indent), attrs, conv_indent(indent)) + def _set(self, table_proto): + pass -class Service: - def __init__(self): - self.server_class = "BrpcPsServer" - self.client_class = "BrpcPsClient" - self.service_class = "BrpcPsService" - self.start_server_port = 0 - self.server_thread_num = 12 - def to_string(self, indent): - service_str = "{}service_param {{{}\n{}}}" +class BarrierTable(Table): + def __init__(self, context, idx): + super(BarrierTable, self).__init__() + self.type = None + self.shard_num = 256 + self.accessor.accessor_class = 'CommMergeAccessor' + self.common.attrs = "" + self.common.dims = [] + self.common.params = [] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.role_maker = context['role_maker'] + self.idx = idx + self.is_sync = context['is_sync'] + + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.table_class = 'BarrierTable' + table_proto.shard_num = 256 + table_proto.type = ps_pb2.PS_OTHER_TABLE + + table_proto.accessor.accessor_class = "CommMergeAccessor" + table_proto.accessor.fea_dim = 0 + table_proto.accessor.embedx_dim = 0 + + table_proto.common.name = "" + table_proto.common.table_name = "barrier_table" + table_proto.common.sync = self.is_sync + table_proto.common.entry = 'none' + + trainer_num = get_trainers(self.role_maker) + if self.is_heter_ps_mode: + trainer_num += len(self.role_maker._get_heter_worker_endpoints()) + table_proto.common.trainer_num = trainer_num - attrs = "" - attrs += "server_class: \"{}\" ".format(self.server_class) - attrs += "client_class: \"{}\" ".format(self.client_class) - attrs += "service_class: \"{}\" ".format(self.service_class) - attrs += "start_server_port: {} ".format(self.start_server_port) - attrs += "server_thread_num: {} ".format(self.server_thread_num) - return service_str.format( - 
conv_indent(indent), attrs, conv_indent(indent)) +class TensorTable(Table): + def __init__(self, idx, tensor_dict, role_maker): + super(TensorTable, self).__init__() + self.idx = idx + self.tensor_dict = tensor_dict + self.role_maker = role_maker + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.table_class = self.tensor_dict.get("tensor_table_class", '') -class DownpourServer: - def __init__(self): - self.service = None - self.tables = [] + table_proto.accessor.accessor_class = "CommMergeAccessor" - def set_service_param(self, service): - self.service = service + table_proto.common.table_name = self.tensor_dict.get("feed_var_name", + '') + table_proto.common.trainer_num = get_trainers(self.role_maker) - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + tensor = Tensor(self.tensor_dict) + tensor._set(table_proto.tensor) - def to_string(self, indent): - server_str = "{}downpour_server_param {{{}\n{}}}" - table_strs = "" - indent += 2 +class SparseTable(Table): + def __init__(self, context, send_ctx): + super(SparseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.type = None + self.table_class = 'MemorySparseTable' + self.accessor = Accessor() - table_strs += "\n" - table_strs += self.service.to_string(indent) + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + + print('new table_name: {}'.format(self.common.table_name)) + all_table_proto = self.context[ + "user_defined_strategy"].sparse_table_configs + usr_table_proto = all_table_proto.add() + for proto in all_table_proto: + if proto.table_name == self.common.table_name: + usr_table_proto = proto + break + table_proto.table_class = 'MemorySparseTable' + warnings.warn("The PS mode must use MemorySparseTable.") + if usr_table_proto.HasField("shard_num"): + table_proto.shard_num = usr_table_proto.shard_num + else: + table_proto.shard_num = 1000 + warnings.warn( + "The shard_num of sparse table is not set, use default value 1000." 
+ ) - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return server_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) + if usr_table_proto.accessor.ByteSize() == 0: + warnings.warn( + "The accessor of sparse table is not set, use default value.") + table_proto.accessor.ParseFromString( + usr_table_proto.accessor.SerializeToString()) + self.accessor._set(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) -class Server: - def __init__(self): - self.servers = [] + check_embedding_dim(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) - def add_server(self, server): - if not isinstance(server, DownpourServer): - raise ValueError("only support instance DownpourServer") - self.servers.append(server) + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False - def __str__(self): - server_str = "server_param {{{}\n}}" - indent = 2 - servers_str = "" - for server in self.servers: - servers_str += "\n" - servers_str += server.to_string(indent) + self.common._set(table_proto.common) - return server_str.format(servers_str) +class GeoSparseTable(SparseTable): + def __init__(self, context, send_ctx): + super(GeoSparseTable, self).__init__(context, send_ctx) + self.table_class = "SparseGeoTable" + if self.context['ps_mode'] != DistributedMode.GEO: + raise ValueError("not geo sparse table!") + + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = ctx.sections()[1] + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = False + self.common._set(table_proto.common) + + +class DenseTable(Table): + def __init__(self, context, send_ctx): + super(DenseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.accessor = Accessor() -class DownpourWorker: + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == True): + return + + table_proto.table_id = ctx.table_id() + + table_proto.type = ps_pb2.PS_DENSE_TABLE + table_proto.table_class = "CommonDenseTable" + table_proto.shard_num = 256 + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = 1 + + self.common.table_name = "MergedDense" + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False + + self.common._set(table_proto.common) + + +class Server: def __init__(self): - self.tables = [] + 
pass - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + def _set(self): + pass - def to_string(self, indent): - worker_str = "{}downpour_worker_param {{{}\n{}}}" - table_strs = "" - indent += 2 - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return worker_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) +class DownpourServer(Server): + def __init__(self): + super(DownpourServer, self).__init__() + + def _set(self): + pass class Worker: def __init__(self): - self.workers = [] + pass - def add_worker(self, worker): - if not isinstance(worker, DownpourWorker): - raise ValueError("only support instance DownpourWorker") - self.workers.append(worker) + def _set(self): + pass - def __str__(self): - worker_str = "worker_param {{{}\n}}" - indent = 2 - workers_str = "" - for worker in self.workers: - workers_str += "\n" - workers_str += worker.to_string(indent) - return worker_str.format(workers_str) +class DownpourWorker(Worker): + def __init__(self): + super(DownpourWorker, self).__init__() + + def _set(self): + pass class fsClient: - def __init__(self, proto): - self.proto = proto - self.uri = proto.uri - self.user = proto.user - self.passwd = proto.passwd - self.hadoop_bin = proto.hadoop_bin - - def to_string(self): - proto_txt = text_format.MessageToString(self.proto) - if proto_txt: - fs_str = "fs_client_param {{\n{}}}" - return fs_str.format(proto_txt) + def __init__(self, fs_client_param): + self.fs_client_param = fs_client_param + + def _set(self, proto): + if not text_format.MessageToString(self.fs_client_param): + return + proto.uri = self.fs_client_param.uri + proto.user = self.fs_client_param.user + proto.passwd = self.fs_client_param.passwd + proto.hadoop_bin = self.fs_client_param.hadoop_bin + + +class PsDescBuilder(object): + def __init__(self, context): + self.context = context + self.is_sync = context['is_sync'] + self.ps_mode = context['ps_mode'] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.use_ps_gpu = context['use_ps_gpu'] + self.send_ctx = get_the_one_send_context( + self.context, + use_origin_program=True, + split_dense_table=self.is_heter_ps_mode) + + self.tensor_table_dict = {} # TODO + self._server_sub_program = [] + + self.tables = self._get_tables() + + self.service = self._get_service() + self.fs_client = self._get_fs_client() + + self.ps_desc = ps_pb2.PSParameter() + + def _get_tensor_tables(self): + program_idx = 0 + if not self.tensor_table_dict: + self._server_sub_program.append(Program().desc) + tables = [] + for table_name in self.tensor_table_dict: + tables.append(globals()['TensorTable'](len(tables), tensor_dict, + self.context['role_maker'])) + program_idx += 1 + return tables + + def _get_tables(self): + tables = [] + for idx, (name, ctx) in enumerate(self.send_ctx.items()): + print('####### {}\n'.format(ctx.is_sparse())) + if ctx.is_sparse(): + if self.ps_mode == DistributedMode.GEO: + tables.append(globals()['GeoSparseTable'](self.context, + ctx)) + else: + tables.append(globals()['SparseTable'](self.context, ctx)) + else: + tables.append(globals()['DenseTable'](self.context, ctx)) + self.tensor_tables = self._get_tensor_tables() + tables.extend(self.tensor_tables) + tables.append(globals()['BarrierTable'](self.context, len(tables))) + return tables + + def _get_service(self): + if self.use_ps_gpu: + return GpuService() else: - return "" + return Service() + + def _get_fs_client(self): 
+ return fsClient(self.context["user_defined_strategy"].fs_client_param) + + def build_worker_desc(self): + for table in self.tables: + table_proto = self.ps_desc.worker_param.downpour_worker_param.downpour_table_param.add( + ) + table._set(table_proto) + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + return text_format.MessageToString(self.ps_desc) + + def build_server_desc(self): + for table in self.tables: + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.sparse_table_maps = {} + if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: + self.sparse_table_maps[ + table_proto.common.table_name] = table_proto.table_id + + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + self.fs_client._set(self.ps_desc.fs_client_param) + return text_format.MessageToString(self.ps_desc) class TheOnePSRuntime(RuntimeBase): @@ -665,8 +801,11 @@ class TheOnePSRuntime(RuntimeBase): self.role_maker = context["role_maker"] self.origin_main_program = context["origin_main_program"] - self.origin_main_programs = context["origin_main_programs"] - + self.origin_main_programs = context.get("origin_main_programs", + [self.origin_main_program]) + self.context["origin_main_programs"] = self.origin_main_programs + self.context["origin_startup_programs"] = context.get( + 'origin_startup_programs', [context['origin_startup_program']]) self.context[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode self.is_heter_ps_mode = self.context['is_heter_ps_mode'] @@ -675,15 +814,23 @@ class TheOnePSRuntime(RuntimeBase): self.context['ps_mode'] = self.context['trainer'].mode self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[ 'use_ps_gpu'] - self.is_sync = True if self.context[ + self.context['is_sync'] = True if self.context[ 'ps_mode'] == DistributedMode.SYNC else False self.context['grad_name_to_param_name'] = {} self.context['tensor_table'] = {} build_var_distributed(self.context) + endpoints = get_ps_endpoints(self.role_maker) + self.string_hosts = [] + for idx, ep in enumerate(endpoints): + host, port = ep.split(":") + pshost = fluid.core.PSHost(host, int(port), idx) + self.string_hosts.append(pshost.serialize_to_string()) + + self.ps_desc_builder = PsDescBuilder(self.context) + def _init_worker(self): - worker = self._get_fleet_proto(is_server=False, is_sync=self.is_sync) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) + worker_desc = self.ps_desc_builder.build_worker_desc() if self.context['use_ps_gpu']: main_program = self.context['loss'].block.program @@ -701,23 +848,11 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = str(worker) + "\n" + str(server) - with open('proto_txt', 'w') as f: - f.write(proto_txt) - + proto_txt = worker_desc + "\n" + server_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: print("worker: \n{}".format(proto_txt)) - endpoints = get_ps_endpoints(self.role_maker) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = 
get_the_one_send_context( @@ -741,7 +876,7 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._role_id() kwargs["trainers"] = self.role_maker._worker_num() - for table in server.servers[0].tables: + for table in server.servers[0].tables: #TODO if table.table_class == "BarrierTable": kwargs["barrier_table_id"] = table.id break @@ -755,7 +890,8 @@ class TheOnePSRuntime(RuntimeBase): trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt, - string_hosts, fluid.global_scope()) + self.string_hosts, + fluid.global_scope()) fleet.util.barrier() info = self._communicator.get_client_info() @@ -812,275 +948,16 @@ class TheOnePSRuntime(RuntimeBase): previous_trainers, self.role_maker._role_id()) - def _push_sparse_param(self, - var_name, - table_id=-1, - scope=fluid.global_scope()): - self._communicator.push_sparse_param(var_name, table_id, scope) - - def _get_executor(self): - executor = fluid.Executor(fluid.CPUPlace()) - if self.is_heter_ps_mode: - if self.role_maker._is_heter_worker(): - heter_device_type = self.role_maker._heter_device_type().upper() - if heter_device_type not in ["GPU", "XPU", "CPU"]: - raise ValueError("Heter Worker Not Support Device {}". - format(device_type)) - if heter_device_type == "GPU": - executor = Executor( - fluid.CUDAPlace( - int(os.getenv("FLAGS_selected_gpus", "0")))) - elif heter_device_type == "XPU": - executor = Executor( - fluid.XPUPlace( - int(os.getenv("FLAGS_selected_xpus", "0")))) - return executor - - def _get_fleet_proto(self, is_server, is_sync, **kwargs): - def _build_merge_accessor(ctx): - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - - if ctx.is_sparse(): - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = ctx.sections()[1] - else: - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = 1 - - return accessor - - def _build_barrier_table(idx): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = "BarrierTable" - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = "barrier_table" - trainer_num = get_trainers(self.context['role_maker']) - if self.is_heter_ps_mode: - trainer_num += len(self.role_maker._get_heter_worker_endpoints( - )) - common.trainer_num = trainer_num - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - return table - - def _build_tensor_table(idx, tensor_dict): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = tensor_dict["tensor_table_class"] - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = tensor_dict["feed_var_name"] - common.trainer_num = get_trainers(self.role_maker) - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - - tensor = Tensor() - tensor.main_program_id = tensor_dict["main_program_id"] - tensor.startup_program_id = tensor_dict["startup_program_id"] - tensor.feed_var_name = tensor_dict["feed_var_name"] - tensor.fetch_var_name = tensor_dict["fetch_var_name"] - 
tensor.tensor_table_class = tensor_dict["tensor_table_class"] - table.tensor = tensor - - return table - - def _add_tensor_table(tables): - tensor_table_dict = {} - program_idx = 0 - for table_name in tensor_table_dict: - if tensor_table_dict[table_name]["startup_program"] != None: - tensor_table_dict[table_name][ - "startup_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["startup_program"].desc) - program_idx += 1 - if tensor_table_dict[table_name]["main_program"] != None: - tensor_table_dict[table_name][ - "main_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["main_program"].desc) - program_idx += 1 - # Todo: Hard code for lr_decay table apply table id - new_table = _build_tensor_table( - len(tables), tensor_table_dict[table_name]) - tables.append(new_table) - return tables - - def _get_tables(): - send_ctx = get_the_one_send_context( - self.context, - use_origin_program=True, - split_dense_table=self.is_heter_ps_mode) - - tables = [] - for idx, (name, ctx) in enumerate(send_ctx.items()): - print(" wxm python test send_ctx.items-->", idx, (name, ctx)) - if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1: - continue - - table = Table() - table.id = ctx.table_id() - common = CommonAccessor() - - if ctx.is_sparse(): - table.type = "PS_SPARSE_TABLE" - table.shard_num = 256 - - common.table_name = self.context['grad_name_to_param_name'][ - ctx.origin_varnames()[0]] - - if self.context['ps_mode'] == DistributedMode.GEO: - table.table_class = "SparseGeoTable" - else: - all_table_proto = self.context[ - "user_defined_strategy"].sparse_table_configs - table_proto = all_table_proto.add() - for proto in all_table_proto: - if proto.table_name == common.table_name: - table_proto = proto - break - if table_proto.HasField("table_class"): - table.table_class = table_proto.table_class - else: - table.table_class = parse_table_class( - common.table_name, - ctx.program_id(), self.context) - if table.table_class != 'MemorySparseTable': - table.table_class = 'MemorySparseTable' - warnings.warn( - "The PS mode must use MemorySparseTable.") - - if table_proto.HasField("shard_num"): - table.shard_num = table_proto.shard_num - else: - table.shard_num = 1000 - warnings.warn( - "The shard_num of sparse table is not set, use default value 1000." - ) - - if table_proto.accessor.ByteSize() == 0: - warnings.warn( - "The accessor of sparse table is not set, use default value." 
- ) - get_default_accessor_proto( - table_proto.accessor, common.table_name, - ctx.program_id(), self.context) - check_embedding_dim(table_proto.accessor, - common.table_name, - ctx.program_id(), self.context) - table.accessor_proto = text_format.MessageToString( - table_proto.accessor) - else: - table.type = "PS_DENSE_TABLE" - table.table_class = "CommonDenseTable" - table.shard_num = 256 - common.table_name = "MergedDense" - - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum - common.parse_by_optimizer(ctx, self.context) - - if ctx.is_sparse(): - common.parse_entry(common.table_name, - ctx.program_id(), self.context) - - if is_sync: - common.sync = "true" - else: - common.sync = "false" - table.common = common - - if table.table_class != 'MemorySparseTable': - accessor = _build_merge_accessor(ctx) - table.accessor = accessor - tables.append(table) - - tensor_table_dict = {} - if len(tensor_table_dict) > 0: - tables = _add_tensor_table(tables) - else: - empty_porgram = Program() - self._server_sub_program.append(empty_porgram.desc) - - barrier_table = _build_barrier_table(len(tables)) - tables.append(barrier_table) - return tables - - if is_server: - server = Server() - downpour_server = DownpourServer() - - service = Service() - dist_strategy = self.context["valid_strategy"] - use_ps_gpu = dist_strategy.a_sync_configs["use_ps_gpu"] - if use_ps_gpu: - service.server_class = "PsLocalServer" - service.client_class = "PsLocalClient" - downpour_server.set_service_param(service) - - tables = _get_tables() - downpour_server.tables = tables - server.add_server(downpour_server) - return server - else: - worker = Worker() - downpour_worker = DownpourWorker() - - tables = _get_tables() - downpour_worker.tables = tables - worker.add_worker(downpour_worker) - return worker - def _init_server(self, dirname=None, var_names=None, **kwargs): + server_desc = self.ps_desc_builder.build_server_desc() role_id = get_role_id(self.role_maker) - endpoints = get_ps_endpoints(self.role_maker) trainers = get_trainers(self.role_maker) if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) - proto_txt = str(server) - fs_client = fsClient(self.context["user_defined_strategy"] - .fs_client_param) - proto_txt = proto_txt + "\n" + fs_client.to_string() - - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("server: \n{}".format(proto_txt)) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) self._server = fluid.core.DistFleetWrapper() - self._server.init_server(proto_txt, string_hosts, role_id, trainers, - self._server_sub_program) + self._server.init_server(server_desc, self.string_hosts, role_id, + trainers, self._server_sub_program) dist_varnames = get_sparse_tablenames(self.origin_main_programs, True) sparse_varnames = get_sparse_tablenames(self.origin_main_programs, @@ -1101,10 +978,7 @@ class TheOnePSRuntime(RuntimeBase): if dirname is None or not load_varnames: return - sparse_table_maps = {} - for table in server.servers[0].tables: - if table.type == "PS_SPARSE_TABLE" and table.common is not None: - sparse_table_maps[table.common.table_name] = table.id + sparse_table_maps = self.ps_desc_builder.sparse_table_maps dirname = os.path.normpath(dirname) pserver_id = self.role_maker._role_id() @@ -1186,7 +1060,7 @@ class 
TheOnePSRuntime(RuntimeBase): sparses = get_the_one_recv_context( self.context, is_dense=False, - split_dense_table=self.is_heter_ps_mod, + split_dense_table=self.is_heter_ps_mode, use_origin_program=True) sparse_varnames = self._save_sparse_params(executor, dirname, sparses, @@ -1413,7 +1287,7 @@ class TheOnePSRuntime(RuntimeBase): fleet.util.barrier() if self.role_maker._is_first_worker(): - sparses = sget_the_one_recv_context( + sparses = get_the_one_recv_context( self.context, is_dense=False, split_dense_table=self.role_maker. diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py index 1a426f3ad6c..701ae8be6cb 100755 --- a/python/paddle/distributed/ps/utils/ps_factory.py +++ b/python/paddle/distributed/ps/utils/ps_factory.py @@ -38,5 +38,7 @@ class PsProgramBuilderFactory(object): elif 'is_fl_ps_mode' in attrs and attrs[ 'is_fl_ps_mode'] == DistributedMode.FL: return globals()['FlPsProgramBuilder'](pass_ctx) - else: + elif attrs['ps_mode'] == DistributedMode.SYNC: return globals()['CpuSyncPsProgramBuilder'](pass_ctx) + else: + return globals()['CpuAsyncPsProgramBuilder'](pass_ctx) diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 25e4dc28bdc..d737542f323 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -95,11 +95,12 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): - logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) + if self.ps_mode == DistributedMode.SYNC: + logger.info("start building cpu-sync-ps program") if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC: raise ValueError("ps mode: {} not matched {}", - format(self.ps_mode, "CpuSyncPsProgramBuilder")) + format(self.ps_mode, "PsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index ebec6900e38..ab5bd7da09d 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -73,7 +73,9 @@ def logger_config(log_path, logging_name): return logger -logger = logger_config(log_path='/ps_log', logging_name='ps_log') +ps_log_root_dir = '/ps_log/' +logger = logger_config( + log_path='/ps_usr_print_log', logging_name='ps_usr_print_log') class DistributedMode: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt old mode 100644 new mode 100755 index 2f6df075478..1443eebf293 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -627,7 +627,7 @@ set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) add_subdirectory(distributed_passes) - + add_subdirectory(ps) add_subdirectory(auto_parallel) # FIXME(typhoonzero): add these tests back diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py index 63dd4b8e21e..93a0044a5e4 100755 --- 
a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py @@ -23,13 +23,24 @@ import unittest import numpy as np from collections import OrderedDict from paddle.distributed.ps.utils.public import logger -from dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists +from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists import paddle.distributed.fleet as fleet class PsPassTestBase(unittest.TestCase): def init(self): - raise NotImplementedError + self.config = {} + self.config['ps_mode_config'] = "" + self.config['worker_num'] = "1" + self.config['server_num'] = "1" + self.config['run_minimize'] = "0" + self.config['run_single_pass'] = "0" + self.config['run_the_one_ps'] = '0' + self.config['debug_new_minimize'] = "0" + self.config['debug_new_pass'] = "0" + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = "" + self.config['applied_pass_name'] = "" def setUp(self): print('Ps setUp...') @@ -37,7 +48,7 @@ class PsPassTestBase(unittest.TestCase): def tearDown(self): print('Ps tearDown...') - def ps_launch(self, config, ps_mode="cpu-ps"): + def ps_launch(self, ps_mode="cpu-ps"): if ps_mode == "cpu-ps" or ps_mode == 'heter-ps': os.environ['WITH_DISTRIBUTE'] = 'ON' @@ -45,23 +56,26 @@ class PsPassTestBase(unittest.TestCase): sys.executable, "-u", ] + [ - "-m", "launch", "--log_dir", config['log_dir'], "--worker_num", - config['worker_num'], "--server_num", config['server_num'] + "-m", "launch", "--log_dir", self.config['log_dir'], + "--worker_num", self.config['worker_num'], "--server_num", + self.config['server_num'] ] if ps_mode == 'heter-ps': os.environ['FLAGS_START_PORT'] = '12004' cmd += [ - '--heter_worker_num', config['heter_worker_num'], - '--heter_devices', config['heter_devices'] + '--heter_worker_num', self.config['heter_worker_num'], + '--heter_devices', self.config['heter_devices'] ] cmd += [ - "../ps/ps_dnn_trainer.py", "-m", config['ps_mode_config'], - "--run_minimize", config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + "../ps/ps_dnn_trainer.py", "-m", self.config['ps_mode_config'], + "--run_minimize", self.config['run_minimize'], + "--run_single_pass", self.config['run_single_pass'], + "--run_the_one_ps", self.config['run_the_one_ps'], + "--debug_new_pass", self.config['debug_new_pass'], + "--debug_new_minimize", self.config['debug_new_minimize'], + "--applied_pass_name", self.config['applied_pass_name'], + "--debug_the_one_ps", self.config['debug_the_one_ps'] ] elif ps_mode == "gpu-ps": os.environ['FLAGS_LAUNCH_BARRIER'] = '0' @@ -80,12 +94,14 @@ class PsPassTestBase(unittest.TestCase): cmd = [ sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m", - config['ps_mode_config'], "--run_minimize", - config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + self.config['ps_mode_config'], "--run_minimize", + self.config['run_minimize'], "--run_single_pass", + self.config['run_single_pass'], "--run_the_one_ps", + self.config['run_the_one_ps'], "--debug_new_pass", + self.config['debug_new_pass'], 
"--debug_new_minimize", + self.config['debug_new_minimize'], "--applied_pass_name", + self.config['applied_pass_name'], "--debug_the_one_ps", + self.config['debug_the_one_ps'] ] cmd = [shlex.quote(c) for c in cmd] diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index b186869ee97..fd558ef0403 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -21,31 +21,26 @@ import numpy as np import paddle from ps_pass_test_base import * -from paddle.distributed.ps.utils.public import logger +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): - def init(self): - self.config = {} - self.config['ps_mode_config'] = "" - self.config['worker_num'] = "1" - self.config['server_num'] = "1" - self.config['run_minimize'] = "0" - self.config['run_single_pass'] = "0" - self.config['debug_new_minimize'] = "0" - self.config['debug_new_pass'] = "0" - self.config['log_dir'] = "" - self.config['applied_pass_name'] = "" - def setUp(self): pass def tearDown(self): pass - def check(self): - pass + def check(self, file1, file2): + with open(file1, 'r', encoding='utf-8') as f: + text1 = f.read() + with open(file2, 'r', encoding='utf-8') as f: + text2 = f.read() + if text1 == text2: + return True + else: + return False def test_ps_optimizer_minimize_cpu_async(self): self.init() @@ -53,16 +48,21 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/async_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/async_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_async passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_async failed!') def test_ps_optimizer_minimize_cpu_sync(self): self.init() @@ -70,16 +70,22 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/sync_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/sync_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) - - self.check() + self.ps_launch() + ''' + file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_sync passed!') + else: + 
logger.error('test_ps_optimizer_minimize_cpu_sync failed!') + ''' def test_ps_optimizer_minimize_cpu_geo(self): self.init() @@ -87,16 +93,21 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/geo_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/geo_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_geo passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_geo failed!') # heter ps 二阶段 def test_ps_optimizer_minimize_heter(self): @@ -110,14 +121,24 @@ class TestPsTrainerPass(PsPassTestBase): self.config['ps_mode_config'] = "../ps/heter_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/heter_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/heter_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') + ''' + file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt' + file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt' + file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt' + if self.check(file1, file2) and self.check(file3, file4): + logger.info('test_ps_optimizer_minimize_heter passed!') + else: + logger.error('test_ps_optimizer_minimize_heter failed!') + ''' def test_ps_optimizer_minimize_gpu(self): self.init() @@ -125,29 +146,42 @@ class TestPsTrainerPass(PsPassTestBase): self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") self.config['debug_new_minimize'] = '1' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") - self.check() + file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_gpu passed!') + else: + logger.error('test_ps_optimizer_minimize_gpu failed!') def test_append_send_ops_pass(self): self.init() self.config['run_single_pass'] = '1' + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" self.config['applied_pass_name'] = "append_send_ops_pass" self.config['debug_new_pass'] = '0' - self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_old_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") + self.ps_launch("cpu-ps") self.config['debug_new_pass'] = '1' - 
self.config['log_dir'] = "/log_new_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_new_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") - - self.check() + self.ps_launch("cpu-ps") + + file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_append_send_ops_pass passed!') + else: + logger.info('test_append_send_ops_pass failed!') def test_distributed_ops_pass(self): pass diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt old mode 100644 new mode 100755 index 3aef3283b82..9af32a8aca7 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -3,6 +3,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + list(APPEND TEST_OPS ${TEST_OP}) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) endforeach(TEST_OP) - -set_tests_properties(test_the_one_ps PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index d08c1d41c89..bc87fc255a5 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -264,12 +264,16 @@ def parse_args(): '--run_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--run_single_pass', type=int, default=0, help="test single pass") + parser.add_argument( + '--run_the_one_ps', type=int, default=0, help="test the_one_ps") parser.add_argument( '--debug_new_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--debug_new_pass', type=int, default=0, help="test single pass") parser.add_argument( '--applied_pass_name', type=str, default="", help="test single pass") + parser.add_argument( + '--debug_the_one_ps', type=int, default=0, help="test the_one_ps") args = parser.parse_args() args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) @@ -280,9 +284,11 @@ def parse_args(): config["pure_bf16"] = args.pure_bf16 config['run_minimize'] = args.run_minimize config['run_single_pass'] = args.run_single_pass + config['run_the_one_ps'] = args.run_the_one_ps config['debug_new_minimize'] = args.debug_new_minimize config['debug_new_pass'] = args.debug_new_pass config['applied_pass_name'] = args.applied_pass_name + config['debug_the_one_ps'] = args.debug_the_one_ps yaml_helper.print_yaml(config) return config @@ -344,15 +350,15 @@ class DnnTrainer(object): fleet_obj.minimize(loss) if fleet.is_server(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' debug_program(_main_file, loss.block.program) elif fleet.is_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' debug_program(_main_file, loss.block.program) elif self.role_maker._is_heter_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + 
'_run_minimize' + '_debug:_' + str( self.config[ 'debug_new_minimize']) + '_heter_worker_main.prototxt' debug_program(_main_file, loss.block.program) @@ -397,16 +403,84 @@ class DnnTrainer(object): _main = worker.append_send_ops_pass(_main, compiled_config) if fleet.is_server(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' debug_program(_main_file, _main) elif fleet.is_worker(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' debug_program(_main_file, _main) + def run_the_one_ps(self): + self.init_fleet_with_gloo() + self.model = get_model(self.config) + self.input_data = self.model.create_feeds() + self.metrics = self.model.net(self.input_data) + loss = self.model._cost + user_defined_strategy = get_user_defined_strategy(self.config) + learning_rate = self.config.get( + "hyper_parameters.optimizer.learning_rate") + sync_mode = self.config.get("runner.sync_mode") + inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + + self.role_maker._generate_role() # 必要 + if self.config['debug_the_one_ps'] == 1: + logger.info("entering run_the_one_ps -- new") + + from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer + ps_optimizer = ParameterServerOptimizer(inner_optimizer) + ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, + user_defined_strategy) + ps_optimizer.minimize_impl(loss) + + from paddle.distributed.ps.the_one_ps import TheOnePSRuntime + _runtime_handle = TheOnePSRuntime() # ps 目录下重构版的 TheOnePSRuntime + _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs) + if fleet.is_worker(): + worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_worker_ps_desc', 'w') as f: + f.write(worker_desc) + if fleet.is_server(): + server_desc = _runtime_handle.ps_desc_builder.build_server_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_server_ps_desc', 'w') as f: + f.write(server_desc) + + else: + pass + ''' + logger.info("entering run_the_one_ps -- old") + fleet_obj = fleet.distributed_optimizer( + inner_optimizer, user_defined_strategy) + fleet_obj.minimize(loss) + if fleet.is_worker(): + worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False) + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f: + f.write(str(worker_desc) + str(server_desc)) + if fleet.is_server(): + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f: + f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string())) + ''' + if fleet.is_server(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_server_main.prototxt' + debug_program(_main_file, loss.block.program) + elif fleet.is_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + elif 
self.role_maker._is_heter_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + if __name__ == "__main__": paddle.enable_static() @@ -418,3 +492,5 @@ if __name__ == "__main__": benchmark_main.run_single_pass() elif config['run_minimize'] == 1: benchmark_main.run_minimize() + elif config['run_the_one_ps'] == 1: + benchmark_main.run_the_one_ps() diff --git a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py old mode 100644 new mode 100755 index 78bae0e50c5..8dddc6abd4c --- a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py +++ b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py @@ -22,16 +22,100 @@ import numpy as np import paddle import paddle.fluid as fluid +import paddle +from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import * +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir +from ps_dnn_trainer import DnnTrainer +from paddle.distributed.fleet.proto import ps_pb2 +from google.protobuf import text_format + -class TestTheOnePs(unittest.TestCase): +class TestTheOnePs(PsPassTestBase): def setUp(self): - print('setUp...') + pass def tearDown(self): - print('tearDown...') + pass - def test_main(self): + def check(self, file1, file2): pass + ''' + f = open(file1, "rb") + ps_desc_1 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_1) + f.close() + + f = open(file2, "rb") + ps_desc_2 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_2) + f.close() + str1 = text_format.MessageToString(ps_desc_1) + str2 = text_format.MessageToString(ps_desc_2) + #logger.info('### msg10: {}'.format(str1)) + #logger.info('### msg20: {}'.format(str2)) + if str1 == str2: + return True + else: + return False + ''' + + def test_ps_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/async_worker_ps_desc' + desc2 = '/ps_log/async_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/async_server_ps_desc' + desc4 = '/ps_log/async_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_async ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_async ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_async ps_desc: server passed!') + else: + logger.info('test_ps_cpu_async ps_desc: server failed!') + + def test_ps_cpu_geo(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/geo_worker_ps_desc' + desc2 = 
'/ps_log/geo_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/geo_server_ps_desc' + desc4 = '/ps_log/geo_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_geo ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_geo ps_desc: server passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: server failed!') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ps_dnn_model.py b/python/paddle/fluid/tests/unittests/ps_dnn_model.py index 0a147334dab..8d91e0f4678 100755 --- a/python/paddle/fluid/tests/unittests/ps_dnn_model.py +++ b/python/paddle/fluid/tests/unittests/ps_dnn_model.py @@ -74,6 +74,7 @@ class DNNLayer(nn.Layer): else: emb = self.embedding(s_input) emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim]) + # emb.stop_gradient = True sparse_embs.append(emb) y_dnn = paddle.concat(x=sparse_embs + [dense_inputs], axis=1) -- GitLab From 28795771408a6dcd757ed367d348fb0ead5ab507 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 2 Mar 2022 16:40:05 +0800 Subject: [PATCH 058/272] run recompute's real backward with amp disabled (#40042) --- python/paddle/distributed/fleet/utils/recompute.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index dccd7f62053..4ccb48ef72e 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -182,9 +182,10 @@ class RecomputeFunction(PyLayer): "none of output has requires_grad=True, this recompute() is not necessary" ) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + # actually backward + with paddle.amp.auto_cast(enable=False): + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) -- GitLab From 8492d3bbf6f01e98d6674b57b27913fe537584dd Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 2 Mar 2022 16:43:52 +0800 Subject: [PATCH 059/272] The backward code of Sparse Conv3d (#40054) Sparse Conv3d backward code --- .../kernels/sparse/convolution_grad_kernel.h | 66 +++++++ paddle/phi/kernels/sparse/cpu/convolution.h | 1 + .../sparse/cpu/convolution_grad_kernel.cc | 166 ++++++++++++++++++ .../kernels/test_sparse_conv3d_dev_api.cc | 112 +++++++++++- 4 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h new file mode 100644 index 00000000000..1a6ac852448 --- /dev/null +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::vector Conv3dGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups) { + DenseTensor x_grad = phi::Empty(dev_ctx); + DenseTensor kernel_grad = phi::Empty(dev_ctx); + Conv3dGradKernel(dev_ctx, + x, + rulebook, + kernel, + out_grad, + paddings, + dilations, + strides, + groups, + &x_grad, + &kernel_grad); + std::vector out(2); + out[0] = x_grad; + out[1] = kernel_grad; + return out; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 5803069d927..ab2fef5320f 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc new file mode 100644 index 00000000000..d4f770ce871 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +// rulebook: +//[ +// [kernel_index], +// [in_i], +// [out_i], +//] +// x_grad = out_grad * transpose(kenrel) +// kernel_grad = transpose(x) * out_grad +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad) { + const auto& kernel_dims = kernel.dims(); + const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + const int* rulebook_ptr = rulebook.data(); + + const int rulebook_len = rulebook.dims()[1]; + + DenseTensorMeta in_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta d_x_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_grad_features_meta( + x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor d_x_features = + phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::DenseTensor out_grad_features = + phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel()); + T* d_x_features_ptr = d_x_features.data(); + dev_ctx.Alloc(&out_grad_features, + out_grad_features.dtype(), + sizeof(T) * out_grad_features.numel()); + T* out_grad_features_ptr = out_grad_features.data(); + kernel_grad->Resize(kernel_dims); + dev_ctx.Alloc( + kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + T* d_kernel_ptr = kernel_grad->data(); + + Gather(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + const int M = counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), 
+ tmp_d_kernel_ptr); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); + } + + // 4. scatter + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + Scatter(d_x_features_ptr, + rulebook.data() + rulebook_len, + rulebook_len, + in_channels, + x_grad_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv_grad, + CPU, + ALL_LAYOUT, + phi::sparse::Conv3dGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 57601514370..00b2a256a95 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -59,7 +60,10 @@ void TestConv3dBase(const std::vector& indices, const std::vector& paddings, const std::vector& strides, const std::vector& dilations, - const float diff = 1e-3) { + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -122,10 +126,29 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices, 0); - for (uint64_t i = 0; i < correct_out_features.size(); i++) { - float tmp = std::fabs(static_cast( - correct_out_features[i] - out.non_zero_elements().data()[i])); - ASSERT_LT(tmp, diff); + auto f_verify = [&](const T* real_data, + const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = + std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out, + paddings, + dilations, + strides, + 1); + f_verify(grads[0].data(), features_grad); + f_verify(grads[1].data(), kernel_grad); } } } @@ -141,7 +164,11 @@ void TestConv3d(const std::vector& indices, const int non_zero_num, const std::vector& paddings, const std::vector& strides, - const std::vector& dilations) { + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { // test float TestConv3dBase(indices, features, @@ -154,7 +181,11 @@ void TestConv3d(const std::vector& indices, non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + features_grad, + kernel_grad); // test double TestConv3dBase(indices, cast(features), @@ -167,7 +198,11 @@ void TestConv3d(const std::vector& indices, 
non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + cast(features_grad), + cast(kernel_grad)); } TEST(DEV_API, sparse_conv3d) { @@ -467,5 +502,66 @@ TEST(DEV_API, sparse_conv2d) { dilations); } +TEST(DEV_API, sparse_conv3d_backward) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 2; + std::vector indices_flatten = {0, 0, 0, 2, 3, 2, 3, 2}; + + std::vector features = {-0.28833008, 0.0287323}; + // 3*3*3=27 + std::vector kernel = { + 0.64306641, 0.45043945, 0.47216797, 0.22924805, 0.97509766, 0.86181641, + 0.57861328, 0.91796875, 0.87255859, 0.16589355, 0.44555664, 0.01889038, + 0.46459961, 0.44726562, 0.19909668, 0.89697266, 0.37158203, 0.00513077, + 0.69628906, 0.26904297, 0.74707031, 0.54003906, 0.5390625, 0.07958984, + 0.47338867, 0.90966797, 0.17126465}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + std::vector out_features = {4.9200e-03, + 2.6140e-02, + 2.2900e-03, + -2.3596e-01, + 1.5000e-04, + 1.0670e-02, + 5.7200e-03, + 1.2850e-02}; + + std::vector features_grad = {-0.20593, -0.09149}; + std::vector kernel_grad = { + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 0.000e+00, 6.805e-02, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 3.700e-04, 1.600e-04, 0.000e+00, 3.100e-04, 0.000e+00, + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, -6.780e-03, 7.000e-05, + 0.000e+00, 7.500e-04, 1.400e-04}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad); +} + } // namespace tests } // namespace phi -- GitLab From 2a5590a18e3dd90f815f20a82f6dcc722bc17892 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 16:55:19 +0800 Subject: [PATCH 060/272] Move BroadcastTensors OP to phi (#40047) * Move BroadcastTensors OP to phi * Remove mutable_data in impl * Move BilinearTensorProductInferMeta to multiary.h/cc --- .../fluid/operators/broadcast_tensors_op.cc | 99 +----- .../fluid/operators/broadcast_tensors_op.cu | 122 -------- paddle/fluid/operators/broadcast_tensors_op.h | 282 ------------------ paddle/phi/infermeta/multiary.cc | 66 +++- paddle/phi/infermeta/multiary.h | 5 + .../kernels/broadcast_tensors_grad_kernel.h | 27 ++ paddle/phi/kernels/broadcast_tensors_kernel.h | 27 ++ paddle/phi/kernels/complex_grad_kernel.h | 2 +- paddle/phi/kernels/complex_kernel.h | 14 +- .../cpu/broadcast_tensors_grad_kernel.cc | 201 +++++++++++++ .../kernels/cpu/broadcast_tensors_kernel.cc | 30 ++ .../gpu/broadcast_tensors_grad_kernel.cu | 111 +++++++ .../kernels/gpu/broadcast_tensors_kernel.cu | 30 ++ .../impl/broadcast_tensors_kernel_impl.h | 118 ++++++++ .../phi/ops/compat/broadcast_tensors_sig.cc | 28 ++ 15 files changed, 658 insertions(+), 504 deletions(-) delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 paddle/phi/kernels/broadcast_tensors_grad_kernel.h create mode 100644 paddle/phi/kernels/broadcast_tensors_kernel.h create mode 100644 
paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu create mode 100644 paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h create mode 100644 paddle/phi/ops/compat/broadcast_tensors_sig.cc diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d..c3917fad555 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PT_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d..00000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e24769..00000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7634e5e01ac..dc5478e8afb 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,11 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" - +#include #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors) { + std::vector dims; + dims.reserve(tensors.size()); + for (const MetaTensor* tensor : tensors) { + dims.emplace_back(tensor->dims()); + } + return dims; +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out) { + int target_rank = 0; + const auto& input_dims = GetMetaTensorsDim(x); + + // 1. 
Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT(target_rank, + 0, + errors::InvalidArgument("BroadcastTensorsOp requires at " + "least one input tensor to have " + "rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + if (target_dim_size != 1 && dim_size != 1 && + target_dim_size != dim_size) { + PADDLE_THROW(errors::InvalidArgument( + "BroadcastTensorsOp inputs does not satisfy bcast semantics, " + "please check axis = %d in reverse order", + index)); + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. Set Output Dim + for (size_t i = 0; i < out.size(); i++) { + out[i]->set_dims(phi::make_ddim(target_dims)); + out[i]->share_lod(*(x[i])); + out[i]->set_dtype(x[i]->dtype()); + } +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 2afb79daa35..51738c5e08e 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h new file mode 100644 index 00000000000..5ec2e35cc9b --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
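+//
+// BroadcastTensorsGradKernel maps each incoming dout back onto the shape of
+// the corresponding forward input by summing over every axis that was
+// broadcast in the forward pass (the CPU/GPU kernels added in this patch
+// implement this as a reduce_sum over those axes).
+//
+// Shape sketch (sizes chosen purely for illustration):
+//   forward:  x[i] dims [1, 1, 4]    ->  out[i] dims [2, 3, 4]
+//   backward: dout[i] dims [2, 3, 4] ->  dx[i] dims [1, 1, 4],
+//             i.e. dx[i] sums dout[i] over the broadcast axes 0 and 1.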
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx); + +} // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h new file mode 100644 index 00000000000..fb2a6f1136c --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 505d4d37442..be13e2826ea 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 44bfae9820a..3b3003392d3 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } -template -void RealKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); - -template -void ImagKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); +template +void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc new file mode 100644 index 00000000000..7a97f8c2189 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
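+//
+// The CPU implementation below expresses the backward pass as
+//   dOut(flattened) -> reshape(reshape_dims) -> sum(reduce_dims) -> dX,
+// and the SWITCH_RESHAPE_DIMS / *_SWITCH_REDUCE_DIMS macros merely expand
+// that expression for each static rank combination Eigen requires.
+//
+// Worked example (sizes chosen purely for illustration):
+//   dout dims = [2, 3, 4], dx dims = [1, 4]
+//   -> reshape_dims_vec = [2, 3, 4], reduce_dims_vec = [1, 0]
+//   -> dx = dout.reshape([2, 3, 4]) summed over axes 0 and 1, written back
+//      into dx's shape [1, 4]; this instantiates the reduce_size == 2 /
+//      reshape_size == 3 branch of the switch below.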
+ +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, \ + reduce_size)); \ + } \ + } \ + break; \ + } + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but 
" + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 100644 index 00000000000..4cb6db87692 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu new file mode 100644 index 00000000000..6fb24d72145 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + // reduce_sum implementation on CUDA + kernels::TensorReduceImpl>( + ctx, + *input_tensor, + output_tensor, + kps::IdentityFunctor(), + reduce_dims_vec, + ctx.stream()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu new file mode 100644 index 00000000000..aa45bd3c438 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h new file mode 100644 index 00000000000..eb01b83377c --- /dev/null +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace phi { + +template +void ApplyBroadcast(const Context& ctx, + const DenseTensor* input_tensor, + DenseTensor* output_tensor) { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = phi::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + ctx.template Alloc(output_tensor); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = *ctx.eigen_device(); + funcs::EigenBroadcast, T, OutRank>::Eval( + place, y, x, bcast_dims); +} + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out) { + const auto& in_tensors = x; + auto out_tensors = out; + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs,but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc new file mode 100644 index 00000000000..2c979c4aedc --- /dev/null +++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
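+//
+// Only the grad op needs a custom signature in this file: it binds the fluid
+// argument names Out@GRAD and X@GRAD to the phi kernel's dout and dx
+// parameters, i.e.
+//   broadcast_tensors_grad(Out@GRAD) -> phi broadcast_tensors_grad(dout),
+//   with the result written to X@GRAD (dx).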
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BroadcastTensorsGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, + phi::BroadcastTensorsGradOpArgumentMapping); -- GitLab From 7a857924570084851be8b6094f181f217d58fb7c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:18:53 +0800 Subject: [PATCH 061/272] Move transpose to pten (#39327) * immigrate_transpose_to_pten cpu kernel only; test=develop * fix bug; test=develop * add transpose cuda api * bug fix; * fix bugs * fix bugs; test=develop * bug fix; * move transepose to pten; test=develop * fix bug; test=develop * fix bugs; test=develop * add transpose grad fp16 support; test=develop * fix bug; test=develop * fix npu bug; test=develop * fix nemul = 0 bug; test=develop * add fp16 support; test=develop * fix data type register bug; test=develop * fix transpose bug; test=develop * update transpose * fix transpose bug; test=develop * remove useless code; test=develop * remove useless code; test=develop * fix transpose alias bug; test=develop * polish code; test=develop * resolve confict; test=develop * resolve confilct; test=develop * recover prepared operator; test=develop * fix bug; test=develop * polish code; test=develop * fix bug; test=develop * fix bug; test=develop --- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- paddle/fluid/operators/transpose_op.cc | 60 ++------ paddle/fluid/operators/transpose_op.cu | 139 ------------------ paddle/fluid/operators/transpose_op.cu.h | 42 +++--- paddle/fluid/operators/transpose_op.h | 58 -------- .../fluid/operators/transpose_op_npu_test.cc | 2 +- .../phi/kernels/cpu/transpose_grad_kernel.cc | 32 ++++ paddle/phi/kernels/cpu/transpose_kernel.cc | 80 ++++++++++ paddle/phi/kernels/funcs/math_function.cu | 51 +++++++ .../phi/kernels/gpu/transpose_grad_kernel.cu | 34 +++++ paddle/phi/kernels/gpu/transpose_kernel.cu | 57 +++++++ .../kernels/impl/transpose_grad_kernel_impl.h | 38 +++++ paddle/phi/kernels/transpose_grad_kernel.h | 28 ++++ paddle/phi/kernels/transpose_kernel.h | 28 ++++ paddle/phi/ops/compat/transpose_sig.cc | 38 +++++ .../unittests/parallel_executor_test_base.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 1 + .../test_parallel_executor_transformer.py | 1 + ...test_partial_eager_deletion_transformer.py | 2 + .../tests/unittests/test_transpose_op.py | 1 + 20 files changed, 426 insertions(+), 270 deletions(-) delete mode 100644 paddle/fluid/operators/transpose_op.cu create mode 100644 paddle/phi/kernels/cpu/transpose_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/transpose_kernel.cc create mode 100644 paddle/phi/kernels/gpu/transpose_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/transpose_kernel.cu create mode 100644 paddle/phi/kernels/impl/transpose_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/transpose_grad_kernel.h create mode 100644 paddle/phi/kernels/transpose_kernel.h create mode 100644 paddle/phi/ops/compat/transpose_sig.cc diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6e..3791fed23a8 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -29,7 +29,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, 
MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f..1a297e7238c 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5..00000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88..a31ac28c991 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
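 // For example, a tensor viewed as [batch, d1, d2] with permutation [0, 2, 1]
 // only swaps its last two dimensions; the dispatch below then picks a tiled
 // shared-memory kernel, the narrow-dims variant, or the plain element-wise
 // kernel depending on how large d1 and d2 are.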
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
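    // For example (an added illustration, not in the original source): a
    // [2, 3, 4, 5] tensor transposed with perm = [0, 3, 1, 2] keeps input
    // axes 1 and 2 adjacent and in order, so they can be fused into one axis
    // of size 12; the problem becomes shape [2, 12, 5] with perm = [0, 2, 1],
    // which is exactly the rank-3 dim1/dim2 swap handled by the tiled
    // kernels above.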
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0e..a9e4876cc82 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b..5617d728a51 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc new file mode 100644 index 00000000000..9dbcf575f33 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(transpose_grad, + CPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc new file mode 100644 index 00000000000..a80196e7f80 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_kernel.h" +#include +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { + +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + int rank = axis.size(); + switch (rank) { + case 1: + funcs::Transpose trans1; + trans1(ctx, x, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(ctx, x, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(ctx, x, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(ctx, x, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(ctx, x, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(ctx, x, out, axis); + break; + default: + // for rank >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(ctx, x, out, axis); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + CPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index ae368a005f0..df2af82d551 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -187,6 +187,57 @@ void TransposeNormal::operator()( in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); } +template +struct 
TransposeNormal { + void operator()(const phi::GPUContext& context, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = stride(in.dims()); + auto out_stride = stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const phi::GPUPlace& cuda_place = context.GetPlace(); + phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + // define transpose normal #define DEFINE_GPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal; \ diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu new file mode 100644 index 00000000000..0687dc0c200 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(transpose_grad, + GPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu new file mode 100644 index 00000000000..9ea2af292cc --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#include "paddle/fluid/framework/gpu_utils.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + int rank = axis.size(); + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + GPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h new file mode 100644 index 00000000000..6bb555fe28f --- /dev/null +++ b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector reversed_axis(axis); + + dev_ctx.template Alloc(x_grad); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + TransposeKernel(dev_ctx, out_grad, reversed_axis, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_grad_kernel.h b/paddle/phi/kernels/transpose_grad_kernel.h new file mode 100644 index 00000000000..33d4ca7e3c6 --- /dev/null +++ b/paddle/phi/kernels/transpose_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h new file mode 100644 index 00000000000..303b4a9a8f0 --- /dev/null +++ b/paddle/phi/kernels/transpose_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/phi/ops/compat/transpose_sig.cc new file mode 100644 index 00000000000..90961760cfc --- /dev/null +++ b/paddle/phi/ops/compat/transpose_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("transpose", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature TransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "transpose_grad", {GradVarName("Out")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(transpose2, transpose); +PD_REGISTER_BASE_KERNEL_NAME(transpose2_grad, transpose_grad); + +PD_REGISTER_ARG_MAPPING_FN(transpose2, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose2_grad, + phi::TransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose_grad, phi::TransposeGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2a8f72c2170..2633a599256 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,7 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): get_data_from_feeder=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=True, + use_ir_memory_optimize=False, enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_all_optimizer_ops=False, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index d54194164a5..110bb961bbe 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -207,4 +207,5 @@ class TestDygraphSimpleNet(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 1cb39eb131b..b87e8d4e3c2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -206,4 +206,5 @@ class TestTransformer(TestParallelExecutorBase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index 1661f753a84..15d9e0e2daa 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -14,10 +14,12 @@ import unittest import paddle.fluid as fluid +import paddle fluid.core._set_eager_deletion_mode(0.0, 0.55, True) from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 13b880b28bf..1e6b4354dd9 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -463,4 +463,5 @@ class TestMoveAxis(unittest.TestCase): if __name__ == '__main__': + 
    paddle.enable_static()
    unittest.main()
-- 
GitLab


From 66196573ffe73bd3e02a4f713e2b2578bbf601aa Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Wed, 2 Mar 2022 17:50:32 +0800
Subject: [PATCH 062/272] [XPU] Fix Phi Kernel cache problem in operator.cc
 (#40044)

* [XPU] Fix Phi Kernel cache problem in operator.cc

* fix typo
---
 paddle/fluid/framework/operator.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b91ee3c2d63..ffdc3e6d3c2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1210,6 +1210,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name
               << "` not found.";
     }
+  } else {
+    pt_kernel_name = pt_kernel_signature_->name;
+    pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get());
   }
 #ifdef PADDLE_WITH_XPU
   bool is_xpu_unsupport =
-- 
GitLab


From 5898e9abecc05bc039e29838ec4b8fb49ae2d3f0 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Wed, 2 Mar 2022 18:25:54 +0800
Subject: [PATCH 063/272] [Phi]Move elementwise function to funcs directory
 (#39986)

* move elementwise function to funcs directory

* fix compile bugs

* modify according to comment
---
 .../elementwise/elementwise_add_op.kps        |   2 +-
 .../elementwise/elementwise_op_broadcast.cu.h |   3 -
 .../elementwise/elementwise_op_function.h     |  29 +-
 .../elementwise/elementwise_op_impl.cu.h      |   2 +-
 paddle/fluid/operators/viterbi_decode_op.h    |  12 +-
 paddle/phi/kernels/cpu/elementwise.h          | 619 +----------------
 paddle/phi/kernels/cpu/elementwise_grad.h     | 146 ++++
 .../kernels/cpu/elementwise_grad_kernel.cc    |  27 +-
 paddle/phi/kernels/cpu/logical_kernel.cc      |  20 +-
 paddle/phi/kernels/cpu/math_kernel.cc         |   9 +-
 paddle/phi/kernels/funcs/broadcast_function.h |  18 +-
 paddle/phi/kernels/funcs/elementwise_base.h   | 285 ++++----
 .../elementwise_grad_base.h}                  | 655 +++++++++++-------
 paddle/phi/kernels/funcs/elementwise_utils.h  | 121 ++++
 paddle/phi/kernels/gpu/elementwise_grad.h     | 246 +++++++
 .../kernels/gpu/elementwise_grad_kernel.cu    |  27 +-
 paddle/phi/kernels/gpu/logical_kernel.cu      |   3 +-
 paddle/phi/kernels/gpu/math_kernel.cu         |   2 +-
 .../impl/elementwise_grad_kernel_impl.h       |  33 +-
 19 files changed, 1149 insertions(+), 1110 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
index d6e0749318e..3b7457d72e1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
@@ -39,7 +39,7 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8..102127e6ffe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f831098..61862aa9f87 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -133,7 +134,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template ( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - phi::ElemwiseGradComputeWithBroadcast( + phi::funcs::ElemwiseGradComputeWithBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -173,19 +174,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +434,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +982,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fc..f49e2ab4e17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 8f01a0c3604..bf12a03e7b4 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -151,12 +151,12 @@ struct GetInputIndex { const std::vector& output_strides, int output_idx, int* index_array, int* lhs_idx, int* rhs_idx) { int out_dims_size = output_strides.size(); - *lhs_idx = - phi::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); - *rhs_idx = - phi::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); - phi::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, - index_array); + *lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size, + index_array); + *rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size, + index_array); + phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, + index_array); } }; diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 28bf5ab743f..0f67df66113 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -189,250 +189,6 @@ struct SameDimsMultiplyFunctor< } }; -inline void UpdateElementwiseIndexArray(const int* out_dims_array, - const int max_dim, - int* index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int* x_dims_array, - const int max_dim, - const int* index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonGradBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - DX_OP dx_op, - DY_OP dy_op) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - const Tout* out_data = out.data(); - const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); - T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); - if (dx_data != nullptr) { - memset(dx_data, 0, dx->numel() * sizeof(T)); - } - if (dy_data != nullptr) { - memset(dy_data, 0, dy->numel() * sizeof(T)); - } - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (dx_data != nullptr) { - dx_data[x_index] += dx_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - if (dy_data != nullptr) { - dy_data[y_index] += dy_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonForwardBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL( - x_data, phi::errors::InvalidArgument("The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL( - y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); - OutType* out_data = ctx.Alloc(z); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = 
func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - const DDim& x_dims, - const DDim& y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - Functor func, - DenseTensor* z) { - dev_ctx.Alloc(z); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - funcs::TransformFunctor functor( - x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. 
- // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - template struct SameDimsElementwiseCompute { void operator()(const CPUContext& dev_ctx, @@ -443,377 +199,4 @@ struct SameDimsElementwiseCompute { } }; -// BACKWARD CODE - -template -static void ElemwiseGradBroadcast1CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int h, - int w, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int y_offset = i * w + j; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } -} - -template -static void ElemwiseGradBroadcast2CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int pre, - int n, - int post, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int y_offset = i * n * post + j * post + k; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0 && k == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } - } -} - -template -void CommonElementwiseBroadcastBackward(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. 
- if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); - - CommonGradBroadcastCPU(x, - y, - out, - dout, - dx, - dy, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - ctx, - dx_op, - dy_op); -} - -template -void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - ElemwiseGradBroadcast1CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } else { - ElemwiseGradBroadcast2CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } -} - -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. 
-template -void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - const DDim& x_dim = x.dims(); - const DDim& y_dim = y.dims(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } else { - ElemwiseGradComputeWithBroadcast(dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } -} - -/* -****************************** - Add Grad -****************************** -*/ -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - auto blas = phi::funcs::GetBlas(ctx); - if (dx) { - blas.VCOPY( - dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY( - dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, IdentityGrad>( - ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -struct SubGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -void elementwise_sub_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); -} - } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h new file mode 100644 index 00000000000..92587566eb8 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_grad.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" + +namespace phi { + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. 
+// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + funcs::ElemwiseGradComputeWithBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +/* +****************************** + Add Grad +****************************** +*/ +template +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = phi::funcs::GetBlas(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); + } +} + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, IdentityGrad>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +template +void ElementwiseSubGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index c878e8133ff..e48ee805959 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -17,7 +17,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -33,7 +34,7 @@ void AddGradFunc(const CPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { ElemwiseExplicitGradCompute, IdentityGrad>( 
dev_ctx, @@ -68,15 +69,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -101,7 +94,7 @@ void SubtractGradKernel(const Context& dev_ctx, DenseTensor* dy) { // skip out auto* out = &dout; - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); } template @@ -112,15 +105,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 3d179e1e75f..a0747b128e5 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/logical_functor.h" // See Note [ Why still include the fluid headers? ] @@ -24,15 +24,15 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Logical##type##Functor binary_func; \ - ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, -1, binary_func, out); \ +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + funcs::ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ } DEFINE_LOGICAL_BINARY_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 5cfcfe62c78..250f656926c 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -20,6 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" @@ -45,10 +46,10 @@ namespace phi { auto x_dims = x.dims(); \ auto y_dims = y.dims(); \ if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::name##Functor(), out); \ } else { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ } \ } \ @@ -93,10 +94,10 @@ void DivideRawKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } diff --git 
a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 84a36b849af..e9fd4cf47b8 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -25,6 +25,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)( @@ -183,8 +185,6 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) - template __device__ __forceinline__ void LoadData( T *dst, @@ -578,6 +578,20 @@ void BroadcastKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + BroadcastKernel( + dev_ctx, ins, &outs, axis, func); +} + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index d369781f845..235dbdd40f6 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -44,28 +45,6 @@ using ConditionalT = namespace funcs { using DDim = phi::DDim; -template -struct ElemwiseGradNoBroadcast { - const T *x_; - const T *y_; - const Tout *out_; - const Tout *dout_; - - HOSTDEVICE void operator()(size_t i) { - if (dx_ != nullptr) { - dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); - } - if (dy_ != nullptr) { - dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); - } - } - - DX_OP dx_op_; - DY_OP dy_op_; - T *dx_; - T *dy_; -}; - template class RowwiseTransformIterator; @@ -293,73 +272,172 @@ class TransformFunctor { bool is_xsize_larger_; }; -inline DDim trim_trailing_singular_dims(const DDim &dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size == dims.size()) return dims; - std::vector trim_dims; - trim_dims.resize(actual_dims_size); - for (int i = 0; i < actual_dims_size; ++i) { - trim_dims[i] = dims[i]; - } - if (trim_dims.size() == 0) { - return DDim(phi::make_dim()); +template +void CommonForwardBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL( + x_data, errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, errors::InvalidArgument("The input Y should not be empty.")); + OutType *out_data = ctx.Alloc(z); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); 
+ int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); } - DDim actual_dims = phi::make_ddim(trim_dims); - return actual_dims; } -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - * - * New parameter: *is_run_common_broadcast* is a flag to record whether to run - * common broadcast code. - */ -inline void get_mid_dims(const DDim &x_dims, - const DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post, - int *is_run_common_broadcast) { - *pre = 1; - *n = 1; - *post = 1; - *is_run_common_broadcast = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, - true, - phi::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. Received [%d] " - "in X is not equal to [%d] in Y.", - x_dims, - y_dims, - x_dims[i + axis], - y_dims[i])); - *is_run_common_broadcast = 1; - return; - } - (*n) *= y_dims[i]; - } - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; +template +void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + phi::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + phi::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. 
+// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + dev_ctx.Alloc(z); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; } } @@ -395,41 +473,11 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = phi::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } -template -void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, - const DDim &x_dim, - const DDim &y_dim, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - int axis, - DenseTensor *dx, - DenseTensor *dy, - DX_OP dx_op, - DY_OP dy_op) { - size_t N = static_cast(phi::product(x_dim)); - phi::funcs::ForRange for_range(dev_ctx, N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), - y.data(), - out.data(), - dout.data(), - dx_op, - dy_op, - dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), - dy == nullptr ? 
nullptr : dev_ctx.template Alloc(dy)}); -} - inline void ElementwiseGradPreProcess(const DenseTensor &dout, DenseTensor *dx) { if (dx != nullptr) { @@ -806,6 +854,7 @@ void ElementwiseKernel(const KPDevice &ctx, } } } + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h similarity index 78% rename from paddle/phi/kernels/gpu/elementwise.h rename to paddle/phi/kernels/funcs/elementwise_grad_base.h index 12cafc7023b..dff0cfe5b8b 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -14,16 +14,25 @@ limitations under the License. */ #pragma once -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#endif #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; #else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif + #define BLOCK_X 32 #define BLOCK_Y 32 @@ -36,21 +45,361 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; namespace phi { -// General binary elementwise comutaion with the support of broadcast. -template -void ElementwiseCompute(const GPUContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - std::vector ins = {&x, &y}; - std::vector outs = {z}; - z->mutable_data(dev_ctx.GetPlace()); - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, axis, func); +namespace funcs { +using DDim = phi::DDim; + +template +void CommonGradBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? 
nullptr : ctx.Alloc(dy); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +static void ElemwiseGradBroadcast1CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } +} + +template +static void ElemwiseGradBroadcast2CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } + } +} + +template +void CommonElementwiseBroadcastBackward(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << phi::make_ddim(x_dims_array) + << " ydim:" << phi::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } else { + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? 
nullptr : ctx.Alloc(dy)); + } +} + +template +struct ElemwiseGradNoBroadcast { + const T *x_; + const T *y_; + const Tout *out_; + const Tout *dout_; + + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, + const DDim &x_dim, + const DDim &y_dim, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + size_t N = static_cast(phi::product(x_dim)); + phi::funcs::ForRange for_range(dev_ctx, N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), + y.data(), + out.data(), + dout.data(), + dx_op, + dy_op, + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } +#if defined(__NVCC__) || defined(__HIPCC__) // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { @@ -114,7 +463,6 @@ inline void ComputeBroadcastKernelSize(int *x_dims_array, } } -#ifndef __xpu__ template static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, const T *y, @@ -1282,13 +1630,13 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); // for inplace strategy. memset will make dx and dout clear and get wrong // result. if (dx && dx->IsSharedBufferWith(dout)) { @@ -1340,37 +1688,37 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, PADDLE_ENFORCE_GE( axis, 0, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); int pre, n, post, is_run_common_broadcast, axis_trim = 0; if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } // special case for common backward implementation. 
if (is_run_common_broadcast) { @@ -1408,228 +1756,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } -/* -****************************** - Add Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - int stride = GRID_NUM_X * BLOCK_NUM_X; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4 *dout_vec = reinterpret_cast(dout); - float4 *dx_vec = reinterpret_cast(dx); - float4 *dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -void default_elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. - if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - auto *dout_data = dout.data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x.numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<>>( - dout.data(), - size, - vec_size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - 
VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, - int64_t size, - T *dx, - T *dy) { - int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - - while (col < size) { - if (dx != nullptr) { - dx[col] = dout[col]; - } - dy[col] = -dout[col]; - col += BLOCK_NUM_X * GRID_NUM_X; - } -} - -template -void default_elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. - if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = dy->numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = x.numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), - size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); -} - #endif +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h new file mode 100644 index 00000000000..3790044346d --- /dev/null +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { + +namespace funcs { + +using DDim = phi::DDim; + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. + */ +inline void GetMidDims(const DDim &x_dims, + const DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *is_run_common_broadcast) { + *pre = 1; + *n = 1; + *post = 1; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, + true, + phi::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. Received [%d] " + "in X is not equal to [%d] in Y.", + x_dims, + y_dims, + x_dims[i + axis], + y_dims[i])); + *is_run_common_broadcast = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} + +inline DDim TrimTrailingSingularDims(const DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(phi::make_dim()); + } + DDim actual_dims = phi::make_ddim(trim_dims); + return actual_dims; +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h new file mode 100644 index 00000000000..b17196b6b11 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +/* +****************************** + Add Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + int stride = GRID_NUM_X * BLOCK_NUM_X; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void DefaultElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void ElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + ctx.template Alloc(dx); + ctx.template Alloc(dy); + auto *dx_data = dx->data(); + auto *dy_data = dy->data(); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + 
PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, + int64_t size, + T *dx, + T *dy) { + int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + + while (col < size) { + if (dx != nullptr) { + dx[col] = dout[col]; + } + dy[col] = -dout[col]; + col += BLOCK_NUM_X * GRID_NUM_X; + } +} + +template +void default_elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = x.numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), + size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3c4c01b1dc8..d00888aee67 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -17,8 +17,9 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" namespace phi { @@ 
-33,9 +34,9 @@ void AddGradFunc(const GPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { - default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); } } @@ -58,15 +59,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -106,15 +99,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu index f32d4c77d40..1c0bafc932e 100644 --- a/paddle/phi/kernels/gpu/logical_kernel.cu +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -16,9 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/logical_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index fc73ccca6de..af9d5574aa9 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 460e74b5816..ac7d6fd1a0e 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -47,19 +47,14 @@ void AddGradImpl(const Context& dev_ctx, } } -template +template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func, - GradInverseFunc grad_inverse_func) { + DenseTensor* ddout) { // ddOut = ddx + ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -72,28 +67,28 @@ void AddDoubleGradImpl(const Context& dev_ctx, auto ddx_dims = ddx_safe.dims(); auto ddy_dims = ddy_safe.dims(); if (ddx_dims.size() >= ddy_dims.size()) { - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); } else { - grad_inverse_func(dev_ctx, - ddx_safe, - ddy_safe, - axis, - funcs::InverseAddFunctor(), - ddout); + funcs::ElementwiseCompute, T>( + dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); } } } -template +template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func) { + DenseTensor* ddout) { // DDOut = ddx - ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -103,7 +98,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, dev_ctx, y, ddy.get_ptr(), &ddy_safe); ddout->mutable_data(dev_ctx.GetPlace()); - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor(), ddout); } } -- GitLab From 2e6548a9cd2224e1a4b89c1351f1078273f98328 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:40:00 +0800 Subject: [PATCH 064/272] vec scale kernel (#40011) --- .../optimizers/distributed_fused_lamb_op.cu | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index ca0828a6f6a..8bb4606ffff 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -304,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector x_vec; + platform::AlignedVector y_vec; + + platform::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + platform::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -396,7 +412,6 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( for (; i + VecSize <= num; i += stride) { platform::AlignedVector param_vec; platform::AlignedVector grad_vec; - platform::AlignedVector 
weight_decay_vec; platform::AlignedVector mom1_vec; platform::AlignedVector mom2_vec; platform::AlignedVector trust_ratio_div_vec; @@ -760,6 +775,24 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -775,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -792,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } -- GitLab From 09258040e2584f4afd9114b994710232e6769970 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:50:26 +0800 Subject: [PATCH 065/272] Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi library (#40043) * move gather.h gather.cu.h scatter.h scatter.cu.h to phi library * fix CI * fix rocm ci --- .../fluid/operators/detection/bbox_util.cu.h | 1 - .../detection/collect_fpn_proposals_op.cu | 10 +- .../detection/collect_fpn_proposals_op.h | 6 +- .../detection/distribute_fpn_proposals_op.cu | 5 +- .../detection/distribute_fpn_proposals_op.h | 15 +- .../detection/generate_mask_labels_op.cc | 1 - .../detection/generate_proposal_labels_op.cc | 16 +- .../detection/generate_proposals_op.cc | 18 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cc | 18 +- .../detection/generate_proposals_v2_op.cu | 9 +- paddle/fluid/operators/gather_nd_op.cu | 94 +++++----- paddle/fluid/operators/gather_nd_op.h | 66 ++++--- paddle/fluid/operators/gather_op.cu | 32 ++-- paddle/fluid/operators/gather_op.h | 68 +++---- paddle/fluid/operators/gather_test.cc | 4 +- paddle/fluid/operators/grid_sampler_op.h | 1 - .../fluid/operators/math/segment_pooling.cu | 6 +- paddle/fluid/operators/scatter_nd_add_op.cu | 41 ++-- paddle/fluid/operators/scatter_nd_add_op.h | 41 ++-- paddle/fluid/operators/scatter_op.cu | 50 +++-- paddle/fluid/operators/scatter_op.h | 63 +++---- paddle/fluid/operators/scatter_test.cc | 4 +- paddle/fluid/operators/segment_pool_op.cu | 1 - .../sequence_ops/sequence_scatter_op.cc | 2 - .../sequence_ops/sequence_scatter_op.h | 3 +- paddle/fluid/operators/viterbi_decode_op.cu | 38 ++-- paddle/fluid/operators/viterbi_decode_op.h | 128 +++++++------ .../kernels/funcs}/gather.cu.h | 176 +++++++++++------- .../operators => 
phi/kernels/funcs}/gather.h | 114 +++++++----- .../kernels/funcs}/scatter.cu.h | 124 ++++++------ .../operators => phi/kernels/funcs}/scatter.h | 165 ++++++++-------- 32 files changed, 702 insertions(+), 627 deletions(-) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.cu.h (62%) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.h (72%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.cu.h (67%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.h (65%) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75..f170fbbe4b5 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e7..860fdd01794 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3..e5ae9a6ccbd 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = 
context.MultiInput("MultiLevelScores"); - auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f5..7ad25e003b4 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd7611..5479e08c2a5 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf7..c9cc4e72207 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa071440..cbf17048400 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac613..d6130823271 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c174..5fb7973fd89 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd..1f1802574c5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - 
CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5f..005309e8ee5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu index 0de2798bf75..338c4411618 100644 --- a/paddle/fluid/operators/gather_nd_op.cu +++ b/paddle/fluid/operators/gather_nd_op.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
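The CUDA variants above make the same swap to phi::funcs::GPUGather, now fed the CUDA device context taken from the execution context. A hypothetical .cu-side sketch of the filter step (the function name and shapes are illustrative; T and the int index dtype are assumptions):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/gather.cu.h"

    namespace {
    // Keep only the proposals selected by keep_index, as ProposalForOneImage
    // does after its top-k and NMS stages.
    template <typename T>
    void FilterProposalsByIndex(
        const paddle::platform::CUDADeviceContext& dev_ctx,
        const paddle::framework::Tensor& proposals,   // [N, 4], dtype T
        const paddle::framework::Tensor& keep_index,  // [K], int
        paddle::framework::Tensor* filtered) {        // [K, 4], dtype T
      filtered->mutable_data<T>({keep_index.numel(), 4}, dev_ctx.GetPlace());
      phi::funcs::GPUGather<T>(dev_ctx, proposals, keep_index, filtered);
    }
    }  // namespace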
*/ #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { -template +template class GatherNdOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); } } }; -template +template class GatherNdGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); + auto &dev_ctx = ctx.cuda_device_context(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; @@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h index f458c0e1801..d54261008e4 100644 --- a/paddle/fluid/operators/gather_nd_op.h +++ b/paddle/fluid/operators/gather_nd_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); } } }; @@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel { 
PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + auto *index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index a502a130409..8f1d9284c50 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
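Every gather_nd hunk above follows the same shape: read index->dtype() as a phi::DataType, enforce int32/int64, then dispatch the templated phi::funcs helper on the concrete index type. A condensed CPU-side sketch of that dispatch (the wrapper name is illustrative; the <T, int32_t>/<T, int64_t> arguments are assumptions since the rendering drops them):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/fluid/platform/enforce.h"
    #include "paddle/phi/common/data_type.h"
    #include "paddle/phi/kernels/funcs/gather.h"

    namespace {
    // Validate the index dtype and dispatch CPUGatherNd on int32_t / int64_t,
    // mirroring GatherNdOpKernel above. T is the value dtype.
    template <typename T>
    void GatherNdDispatch(const paddle::platform::CPUDeviceContext& dev_ctx,
                          const paddle::framework::Tensor& x,
                          const paddle::framework::Tensor& index,
                          paddle::framework::Tensor* output) {
      const auto index_type = index.dtype();
      PADDLE_ENFORCE_EQ(
          index_type == phi::DataType::INT32 ||
              index_type == phi::DataType::INT64,
          true,
          paddle::platform::errors::InvalidArgument(
              "Index holds the wrong type, it holds [%s], but desires to be "
              "[%s] or [%s].",
              index_type, phi::DataType::INT32, phi::DataType::INT64));
      if (index_type == phi::DataType::INT32) {
        phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, x, index, output);
      } else {
        phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, x, index, output);
      }
    }
    }  // namespace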
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel { } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398da..94de694b2f9 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = 
*ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf8..c962dd06523 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51..93e96694270 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb6d8756bd0..fbdcb99c02a 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
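The gather kernels above also move the Axis side-input check onto phi::DataType. A standalone sketch of that small helper (the function name and the fallback parameter are illustrative; reading the first int32/int64 element is the behaviour the hunks show):

    #include <cstdint>

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/phi/common/data_type.h"

    namespace {
    // Read the gather axis from an auxiliary CPU tensor whose dtype may be
    // int32 or int64, as GatherOpKernel / GatherGradientOpKernel above do.
    // Returns fallback (the "axis" attribute value) for any other dtype,
    // matching the kernels, which only overwrite axis for these two types.
    inline int AxisFromTensorOr(const paddle::framework::Tensor& axis_tensor,
                                int fallback) {
      const auto axis_type = axis_tensor.dtype();
      if (axis_type == phi::DataType::INT32) {
        return static_cast<int>(axis_tensor.data<int32_t>()[0]);
      }
      if (axis_type == phi::DataType::INT64) {
        return static_cast<int>(axis_tensor.data<int64_t>()[0]);
      }
      return fallback;
    }
    }  // namespace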
*/ #include -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -379,9 +379,9 @@ class SegmentPoolGradFunctor { SimpleDiv<<>>(mean_grad.data(), summed_ids->data(), len, dim); - GPUGather(context, mean_grad, segments, in_grad); + phi::funcs::GPUGather(context, mean_grad, segments, in_grad); } else if (pooltype == "SUM") { - GPUGather(context, out_grad, segments, in_grad); + phi::funcs::GPUGather(context, out_grad, segments, in_grad); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu index 6448f8cc405..2fe3fcb759d 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ b/paddle/fluid/operators/scatter_nd_add_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_nd_add_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.cuda_device_context(); // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + if (index_type 
== phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h index 2bdf9ec58a8..81c95fe55ab 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ b/paddle/fluid/operators/scatter_nd_add_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel { // In place output: Out = X framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index 549e30803b4..7755e376bc1 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
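scatter_nd_add keeps its structure through the migration: the forward copies X into Out and accumulates Updates at Ids, and the gradient w.r.t. Updates is simply a gather_nd of dOut. A condensed CPU sketch with illustrative names and int32 indices assumed (the op kernels dispatch on int64 the same way):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/framework/tensor_util.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/gather.h"
    #include "paddle/phi/kernels/funcs/scatter.h"

    namespace {
    // Forward: Out = X, then Out[Ids] += Updates.
    template <typename T>
    void ScatterNdAddForward(const paddle::platform::CPUDeviceContext& dev_ctx,
                             const paddle::framework::Tensor& x,
                             const paddle::framework::Tensor& ids,
                             const paddle::framework::Tensor& updates,
                             paddle::framework::Tensor* out) {
      paddle::framework::TensorCopySync(x, dev_ctx.GetPlace(), out);
      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, updates, ids, out);
    }

    // Backward w.r.t. Updates: dUpdates = dOut[Ids]. dupdates is expected to
    // already carry the Updates shape; the op kernel sets it via InferShape
    // and allocates with mutable_data before gathering.
    template <typename T>
    void ScatterNdAddGradForUpdates(
        const paddle::platform::CPUDeviceContext& dev_ctx,
        const paddle::framework::Tensor& dout,
        const paddle::framework::Tensor& ids,
        paddle::framework::Tensor* dupdates) {
      dupdates->mutable_data<T>(dev_ctx.GetPlace());
      phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, dout, ids, dupdates);
    }
    }  // namespace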
*/ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel { framework::TensorCopy(*X, ctx.GetPlace(), Out); // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op Index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } } }; @@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git 
a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 69ab6c7135c..7733181a93f 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel { // In place output: Out = X, Out[Ids] = Updates framework::TensorCopy(*X, ctx.GetPlace(), Out); // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } } } @@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type 
== framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1..93f2d60e5f2 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 4e20844dc32..e147e62a983 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd..25c12ab565a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc46..2960b77d5ac 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
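The scatter kernels above preserve the overwrite switch: assignment semantics go through ScatterAssign and accumulation through ScatterAssignAdd, both now taken from phi::funcs and handed the concrete device context. A minimal CPU sketch modeled on ScatterOpKernel and scatter_test.cc, with an illustrative wrapper name and int32 indices assumed:

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/framework/tensor_util.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/scatter.h"

    namespace {
    // Out = X, then either Out[Ids] = Updates (overwrite) or Out[Ids] += Updates.
    template <typename T>
    void ScatterForward(const paddle::platform::CPUDeviceContext& dev_ctx,
                        const paddle::framework::Tensor& x,
                        const paddle::framework::Tensor& ids,  // [K], int32 rows
                        const paddle::framework::Tensor& updates,
                        bool overwrite,
                        paddle::framework::Tensor* out) {
      paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), out);
      if (overwrite) {
        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, updates, ids, out);
      } else {
        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, updates, ids, out);
      }
    }
    }  // namespace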
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 3c546dd8156..68628fb2748 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -11,8 +11,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) { template