Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_sgd_to_phi

a4bccde0 · phlrain · 26aac8d8 · ce8ed978 · a4bccde0 · a4bccde0
151 changed file
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

 if(NOT DEFINED XPU_BASE_URL)
  SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
 else()
  SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME)
      else()
        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
        find_fluid_modules(${TARGET_NAME})
+        find_phi_modules(${TARGET_NAME})
      endif()
      if (xpu_library_DEPS)
        add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})

--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST)
                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
            elseif (${kernel_path} MATCHES "./gpudnn\/")
                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
+            elseif (${kernel_path} MATCHES "./kps\/")
+                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
            else ()
                # deal with device independent kernel, now we use CPU temporaary
                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
@@ -97,6 +99,7 @@ function(kernel_library TARGET)
    set(gpu_srcs)
    set(xpu_srcs)
    set(gpudnn_srcs)
+    set(kps_srcs)
    set(selected_rows_srcs)
    # parse and save the deps kerenl targets
    set(all_srcs)
@@ -128,6 +131,9 @@ function(kernel_library TARGET)
            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
                list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
            endif()
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+                list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+            endif()
            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
                list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
            endif()
@@ -137,6 +143,15 @@ function(kernel_library TARGET)
                list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
            endif()
        endif()
+        if (WITH_XPU_KP)
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+                # Change XPU2 file suffix
+                # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
+                file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
+                file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+                list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+            endif()
+        endif()
    else()
        # TODO(chenweihang): impl compile by source later
    endif()
@@ -150,6 +165,7 @@ function(kernel_library TARGET)
    list(APPEND all_srcs ${gpu_srcs})
    list(APPEND all_srcs ${xpu_srcs})
    list(APPEND all_srcs ${gpudnn_srcs})
+    list(APPEND all_srcs ${kps_srcs})
    foreach(src ${all_srcs})
        file(READ ${src} target_content)
        string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
@@ -176,11 +192,20 @@ function(kernel_library TARGET)
    list(LENGTH gpu_srcs gpu_srcs_len)
    list(LENGTH xpu_srcs xpu_srcs_len)
    list(LENGTH gpudnn_srcs gpudnn_srcs_len)
+    list(LENGTH kps_srcs kps_srcs_len)
    list(LENGTH selected_rows_srcs selected_rows_srcs_len)

+    # kernel source file level
+    # level 1: base device kernel
+    # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+    # level 2: device-independent kernel
+    # - common_srcs
+    # level 3: Kernel implemented by reusing device-independent kernel
+    # - selected_rows_srcs
+
    # Build Target according different src organization
    if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-        ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
+        ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
        (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
        # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
        if (WITH_GPU)
@@ -193,6 +218,11 @@ function(kernel_library TARGET)
                hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
                hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
            endif()
+        elseif (WITH_XPU_KP)
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+                xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
+            endif()
        else()
            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
                cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -200,7 +230,7 @@ function(kernel_library TARGET)
            endif()
        endif()
    # If there are only specific device srcs, build target using this rule.
-    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
+    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
        if (WITH_GPU)
            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
                nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -209,6 +239,10 @@ function(kernel_library TARGET)
            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
                hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
            endif()
+        elseif (WITH_XPU_KP)
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+                xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
        else()
            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
                cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -222,6 +256,9 @@ function(kernel_library TARGET)
        elseif (WITH_ROCM)
            hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+        elseif (WITH_XPU_KP)
+            xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
        else()
            cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
@@ -232,6 +269,8 @@ function(kernel_library TARGET)
            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        elseif (WITH_ROCM)
            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        elseif (WITH_XPU_KP)
+            xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        else()
            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        endif()
@@ -240,6 +279,8 @@ function(kernel_library TARGET)
            nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        elseif (WITH_ROCM)
            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        elseif (WITH_XPU_KP)
+            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        else()
            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
        endif()
@@ -249,7 +290,7 @@ function(kernel_library TARGET)

    if (${target_build_flag} EQUAL 1)
        if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
-            ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
+            ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
            ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
            # append target into PHI_KERNELS property
            get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
@@ -275,6 +316,9 @@ function(kernel_library TARGET)
        if (${gpudnn_srcs_len} GREATER 0)
            kernel_declare(${gpudnn_srcs})
        endif()
+        if (${kps_srcs_len} GREATER 0)
+            kernel_declare(${kps_srcs})
+        endif()
        if (${selected_rows_srcs_len} GREATER 0)
            kernel_declare(${selected_rows_srcs})
        endif()

--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)

 if(WITH_NCCL)
    cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)

--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/reducer.h"
+#include "paddle/phi/common/data_type.h"
+
+namespace paddle {
+namespace distributed {
+
+std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
+    const std::vector<Tensor> tensors,
+    const std::vector<bool> &is_sparse_gradient,
+    const std::vector<size_t> &group_size_limits,
+    const std::vector<int64_t> &tensor_indices) {
+  PADDLE_ENFORCE_EQ(
+      tensors.size(), is_sparse_gradient.size(),
+      platform::errors::PreconditionNotMet(
+          "tensors len must be equal to is_sparse_gradient len, but "
+          "[%lu] != [%lu]",
+          tensors.size(), is_sparse_gradient.size()));
+  auto check_perm = [](const std::vector<int64_t> &x) -> bool {
+    size_t len = x.size();
+    std::vector<size_t> cnt(len, 0);
+    for (size_t i = 0; i < len; ++i) {
+      if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
+        return false;
+      }
+      cnt[x[i]]++;
+    }
+    return true;
+  };
+
+  PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices),
+                    platform::errors::PreconditionNotMet(
+                        "tensor_indices must be a permutation from 0 to %lu",
+                        tensor_indices.size()));
+  // the return vector
+  std::vector<std::vector<size_t>> res;
+
+  // Key: the var type
+  // Value: should use which index in group_size_limits for group size limit
+  std::map<experimental::DataType, size_t> group_limit_index;
+
+  // Key: the var type
+  // Value: <the var index in input tensors, total numel in this group>
+  std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
+      next_group;
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    const auto &var = tensors[i];
+
+    size_t tensor_real_index = i;
+    if (!tensor_indices.empty()) {
+      tensor_real_index = tensor_indices[i];
+    }
+
+    if (is_sparse_gradient[tensor_real_index]) {
+      // we keep sparse var a single group
+      res.push_back({tensor_real_index});
+      continue;
+    }
+
+    const auto &var_dtype = var.dtype();
+    VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
+    auto &group_info = next_group[var_dtype];
+
+    int64_t var_size = -1;
+
+    if (var.is_dense_tensor()) {
+      var_size =
+          std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
+    } else {
+      VLOG(3) << "var " << var.name()
+              << " is not tensor or selected_rows, so skip it";
+      continue;
+    }
+
+    group_info.first.push_back(tensor_real_index);
+    group_info.second += experimental::SizeOf(var_dtype) * var_size;
+    // group_info.second += framework::SizeOfType(var_dtype) * var_size;
+
+    if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
+      // means it is the first var of var_dtype
+      group_limit_index[var_dtype] = 0;
+    }
+    auto &cur_limit_index = group_limit_index[var_dtype];
+    if (group_info.second >= group_size_limits[cur_limit_index]) {
+      // exceed group capacity and create a new group
+      res.emplace_back(std::move(group_info.first));
+      group_info = std::pair<std::vector<size_t>, size_t>();
+      cur_limit_index =
+          (std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
+    }
+  }
+
+  // add the final groups
+  for (auto &e : next_group) {
+    auto &group_info = e.second;
+    if (!group_info.first.empty()) {
+      res.emplace_back(std::move(group_info.first));
+    }
+  }
+
+  for (const auto &group_index : res) {
+    PADDLE_ENFORCE_NE(
+        group_index.empty(), true,
+        platform::errors::PreconditionNotMet(
+            "AssignGroupBySize construct empty group, please check."));
+  }
+  if (tensor_indices.empty()) {
+    std::sort(res.begin(), res.end(),
+              [](const std::vector<size_t> &x, const std::vector<size_t> &y) {
+                return x.front() < y.front();
+              });
+  }
+  return res;
+}
+
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,41 +14,19 @@

 #pragma once

-#include <string>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include <map>
+#include <vector>
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"

 namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the Conv and ConvAffineChannel.
- */
-class Graph;
-
-class ConvAffineChannelFusePass : public FusePassBase {
- public:
-  ConvAffineChannelFusePass();
-  virtual ~ConvAffineChannelFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph*) const override;
-  const std::string name_scope_{"conv_affine_channel_fuse"};
-};
-
-class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
- public:
-  ConvEltwiseAddAffineChannelFusePass();
-  virtual ~ConvEltwiseAddAffineChannelFusePass() {}
+namespace distributed {
+using Tensor = paddle::experimental::Tensor;

- protected:
-  void ApplyImpl(ir::Graph*) const override;
-  const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
-};
+std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
+    const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
+    const std::vector<size_t>& group_size_limits,
+    const std::vector<int64_t>& tensor_indices = {});

-}  // namespace ir
-}  // namespace framework
+}  //  namespace distributed
 }  //  namespace paddle
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
  // TODO(chenweihang): support multiple inputs and outputs later
  phi::InferMetaContext infer_mete_context;
  for (auto& in_name : input_names) {
-    if (ctx->HasInput(in_name)) {
-      infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>(
-          ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime()));
+    if (ctx->HasInputs(in_name)) {
+      auto input_var = ctx->GetInputVarPtrs(in_name);
+      if (input_var.size() == 1) {
+        infer_meta_context.EmplaceBackInput(
+            std::make_shared<CompatMetaTensor>(input_var[0], ctx->IsRuntime()));
      } else {
-      infer_meta_context.EmplaceBackInput({nullptr});
+        paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> inputs;
+        inputs.reserve(input_var.size());
+        for (const auto& in : input_var) {
+          inputs.push_back(
+              std::make_shared<CompatMetaTensor>(in, ctx->IsRuntime()));
        }
+        infer_meta_context.EmplaceBackInputs(std::move(inputs));
      }
-
-  for (auto& out_name : output_names) {
-    if (ctx->HasOutput(out_name)) {
-      infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
-          ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime()));
    } else {
-      infer_meta_context.EmplaceBackOutput({nullptr});
+      infer_meta_context.EmplaceBackInput({nullptr});
    }
  }
+
  auto attr_reader = ctx->Attrs();
  for (size_t i = 0; i < attr_names.size(); ++i) {
    auto attr_name = attr_names[i];
@@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
          }
        } else {
          // If is not in runtime, we will set default value(-1) for ScalarArray
-          int64_t num_ele = 0;
          std::vector<VarDesc*> vars;
          vars.reserve(infershape_inputs.size());
-          for (size_t i = 0; i < infershape_inputs.size(); i++) {
+          for (size_t i = 0; i < infershape_inputs.size(); ++i) {
            vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i]));
          }

+          int64_t num_ele = 0;
          if (vars.size() == 1) {
            num_ele = 1;
            const auto& tensor_dims = vars[0]->GetShape();
@@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
              num_ele *= tensor_dims[i];
            }
          } else {
-            for (auto& var : vars) {
-              const auto& tensor_dims = var->GetShape();
-              PADDLE_ENFORCE_EQ(tensor_dims.size(), 1,
-                                platform::errors::InvalidArgument(
-                                    "The shape is constructed by multi-tensor, "
-                                    "every tensor's dims should be 1. But your "
-                                    "shape has tensor that dims is %s.",
-                                    tensor_dims.size()));
-              num_ele += tensor_dims[0];
-            }
+            num_ele = vars.size();
          }
          phi::ScalarArray tensor_attr(std::vector<int32_t>(num_ele, -1));
          tensor_attr.SetFromTensor(true);
@@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
            std::type_index(typeid(std::vector<int32_t>))) {
          infer_meta_context.EmplaceBackAttr(std::move(
              phi::ScalarArray(BOOST_GET_CONST(std::vector<int32_t>, attr))));
+        } else if (std::type_index(attr.type()) ==
+                   std::type_index(typeid(int))) {
+          infer_meta_context.EmplaceBackAttr(
+              phi::ScalarArray({BOOST_GET_CONST(int, attr)}));
        } else {
          PADDLE_THROW(platform::errors::Unimplemented(
              "Unsupported cast op attribute `%s` to ScalarArray when "
-              "construct KernelContext.",
+              "construct InferMetaContext.",
              attr_name));
        }
      }
@@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
        }
      } else if (ctx->HasInput(attr_name)) {
        const auto& infershape_input = ctx->GetInputVarPtrs(attr_name);
-
        if (infershape_input.size() == 1) {
          if (ctx->IsRuntime()) {
            Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]);
@@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
            "Unsupported attribute type is received when call "
            "InferShapeFunctor."));
      }
+    } else {
+      // do nothing
+    }
+  }
+
+  for (auto& out_name : output_names) {
+    if (ctx->HasOutputs(out_name)) {
+      auto output_var = ctx->GetOutputVarPtrs(out_name);
+      if (output_var.size() == 1) {
+        infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
+            output_var[0], ctx->IsRuntime()));
+      } else {
+        paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> outputs;
+        outputs.reserve(output_var.size());
+        for (const auto& out : output_var) {
+          outputs.emplace_back(
+              std::make_shared<CompatMetaTensor>(out, ctx->IsRuntime()));
+        }
+        infer_meta_context.EmplaceBackOutputs(std::move(outputs));
+      }
+    } else {
+      infer_meta_context.EmplaceBackOutput({nullptr});
    }
  }


--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -78,7 +78,6 @@ pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
-pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)

--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
-
-#include <cmath>
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-
-namespace phi {
-class DenseTensor;
-}  // namespace phi
-
-namespace paddle {
-namespace framework {
-class Scope;
-}  // namespace framework
-}  // namespace paddle
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class Node;
-
-#define GET_CONV_BN_NODES(pattern_name)                                    \
-  /* OPERATORS */                                                          \
-  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                     \
-  GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
-  /* CONV inputs */                                                        \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);       \
-  /* CONV outputs */                                                       \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);             \
-  /* Affine Channel inputs */                                              \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name);             \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name);               \
-  /* Affine channel outputs */                                             \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
-
-void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
-                                const ir::Node& ac_scale,
-                                const LoDTensor& ac_bias_tensor,
-                                LoDTensor* eltwise_y_in_tensor) {
-  using EigenVectorArrayMap =
-      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using ConstEigenVectorArrayMap =
-      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using EigenMatrixArrayMap = Eigen::Map<
-      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-  // Re-compute bias of conv2d from AffineChannel
-  PADDLE_ENFORCE_EQ(
-      eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(),
-      platform::errors::InvalidArgument(
-          "Tensor elementwise y(%d) and activation bias(%d) must have same "
-          "dimension.",
-          eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size()));
-
-  auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
-
-  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
-                                       scale_tensor->numel(), 1);
-  ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
-                                         ac_bias_tensor.numel(), 1);
-
-  EigenVectorArrayMap eltwise_y_in_array(
-      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-      eltwise_y_in_tensor->numel(), 1);
-
-  eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
-
-  // Re-compute weight of conv2d from AffineChannel
-  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
-  auto weights_shape = weights->dims();
-  auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1);
-  auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
-
-  EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
-                                       weights_shape_2d[1]);
-
-  weights_array_2d.colwise() *= scale_array;
-
-  // Check for subnormal values that slows down convolution execution
-  for (int i = 0; i < weights->numel(); ++i) {
-    if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
-  }
-}
-
-ConvAffineChannelFusePass::ConvAffineChannelFusePass() {
-  AddOpCompat(OpCompat("conv2d"))
-      .AddInput("Input")
-      .IsTensor()
-      .End()
-      .AddInput("Filter")
-      .IsTensor()
-      .End()
-      .AddInput("Bias")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddInput("ResidualData")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddOutput("Output")
-      .IsTensor()
-      .End()
-      .AddAttr("strides")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("paddings")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("padding_algorithm")
-      .IsOptional()
-      .IsStringIn({"EXPLICIT", "SAME", "VALID"})
-      .End()
-      .AddAttr("groups")
-      .IsNumGE(1)
-      .End()
-      .AddAttr("dilations")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("data_format")
-      .IsStringIn({"NCHW", "AnyLayout"})
-      .End();
-
-  AddOpCompat(OpCompat("affine_channel"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Scale")
-      .IsTensor()
-      .End()
-      .AddInput("Bias")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("data_layout")
-      .IsStringIn({"NCHW", "AnyLayout"})
-      .End();
-
-  AddOpCompat(OpCompat("elementwise_add"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Y")
-      .IsTensor()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("axis")
-      .IsNumEQ(1)
-      .End();
-}
-
-void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(
-      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE_NOT_NULL(
-      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
-                                              name_scope_);
-  conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
-
-  int found_conv_ac_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    if (!IsCompat(subgraph, g)) {
-      LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed.";
-      return;
-    }
-
-    VLOG(4) << "handle ConvAffineChannel fuse";
-
-    GET_CONV_BN_NODES(conv_ac_pattern);
-
-    auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
-    if (data_format == "AnyLayout") {
-      LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, "
-                                 "it's wrong if data_format of conv is not "
-                                 "NCHW.";
-    }
-
-    // Get affine_channel bias for resizing eltwise_y!
-    auto* ac_bias_tensor =
-        scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
-
-    // Create eltwise_y (conv bias) variable
-    VarDesc eltwise_y_in_desc(
-        patterns::PDNodeName(name_scope_, "eltwise_y_in"));
-    // Set shape && datatype manually
-    eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims()));
-    eltwise_y_in_desc.SetDataType(
-        framework::TransToProtoVarType(ac_bias_tensor->dtype()));
-    eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel());
-    eltwise_y_in_desc.SetPersistable(true);
-
-    // Initialize eltwise_y
-    auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
-    auto* eltwise_y_in_tensor =
-        scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
-    eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
-    std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-                eltwise_y_in_tensor->numel(), 0.0f);
-
-    // update weights and biases
-    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
-                               eltwise_y_in_tensor);
-
-    // create an elementwise add node.
-    OpDesc desc;
-    desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
-    desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
-    desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
-    desc.SetType("elementwise_add");
-    desc.SetAttr("axis", 1);
-    desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
-
-    auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
-
-    GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
-
-    IR_NODE_LINK_TO(conv_out, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_op, ac_out);
-    found_conv_ac_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_ac_count);
-}
-
-ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() {
-  AddOpCompat(OpCompat("conv2d"))
-      .AddInput("Input")
-      .IsTensor()
-      .End()
-      .AddInput("Filter")
-      .IsTensor()
-      .End()
-      .AddInput("Bias")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddInput("ResidualData")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddOutput("Output")
-      .IsTensor()
-      .End()
-      .AddAttr("strides")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("paddings")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("padding_algorithm")
-      .IsOptional()
-      .IsStringIn({"EXPLICIT", "SAME", "VALID"})
-      .End()
-      .AddAttr("groups")
-      .IsNumGE(1)
-      .End()
-      .AddAttr("dilations")
-      .IsType<std::vector<int>>()
-      .End()
-      .AddAttr("data_format")
-      .IsStringIn({"NCHW", "AnyLayout"})
-      .End();
-  AddOpCompat(OpCompat("affine_channel"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Scale")
-      .IsTensor()
-      .End()
-      .AddInput("Bias")
-      .IsTensor()
-      .IsOptional()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("data_layout")
-      .IsStringIn({"NCHW", "AnyLayout"})
-      .End();
-  AddOpCompat(OpCompat("elementwise_add"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Y")
-      .IsTensor()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("axis")
-      .IsNumEQ(1)
-      .End();
-}
-
-void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(
-      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE_NOT_NULL(
-      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
-                                              name_scope_);
-  conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
-
-  int found_conv_ac_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    if (!IsCompat(subgraph, g)) {
-      LOG(WARNING)
-          << "ConvEltwiseAddAffineChannelFusePass in op compat failed.";
-      return;
-    }
-
-    VLOG(4) << "handle ConvBN fuse";
-
-    GET_CONV_BN_NODES(conv_ac_pattern);
-    auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
-    if (data_format == "AnyLayout") {
-      LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is "
-                                 "enabled, it's wrong if data_format of conv "
-                                 "is not NCHW.";
-    }
-    // OPERATORS
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
-    // BIAS inputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
-    // BIAS outputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
-
-    // Get eltwise_y (conv bias) variable
-    auto* eltwise_y_in_tensor =
-        scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
-
-    // Get batch norm bias
-    auto* ac_bias_tensor =
-        scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
-
-    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
-                               eltwise_y_in_tensor);
-
-    // Update the elementwise_add node
-    eltwise->Op()->SetAttr("axis", 1);
-    eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
-
-    GraphSafeRemoveNodes(graph,
-                         {ac_scale, ac_bias, affine_channel, eltwise_out});
-
-    IR_NODE_LINK_TO(eltwise, ac_out);
-
-    found_conv_ac_count++;
-  };
-
-  gpd(graph, handler);
-  AddStatis(found_conv_ac_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_affine_channel_fuse_pass,
-              paddle::framework::ir::ConvAffineChannelFusePass);
-REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
-              paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
-REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
-    .AddCombination(
-        paddle::framework::compatible::OpVersionComparatorCombination()
-            .LE("conv2d", 1)
-            .EQ("affine_channel", 0));
-REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
-    .AddCombination(
-        paddle::framework::compatible::OpVersionComparatorCombination()
-            .LE("conv2d", 1)
-            .LE("elementwise_add", 1)
-            .EQ("affine_channel", 0));
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
    }
    pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
  }
+  VLOG(4) << "Done inputs";

  for (size_t i = 0; i < output_names.size(); ++i) {
    auto it = ctx.outputs.find(output_names[i]);
@@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext(
            "Unsupported output `%s` type when call pt kernel.",
            framework::ToTypeName(var->Type())));
      }
-
-      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
-                                                      output_defs.at(i));
-      SetAllocationForOutputTenosr(
-          tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
-
      pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
    }

    pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
  }
+  VLOG(4) << "Done outputs";

  for (size_t i = 0; i < attr_names.size(); ++i) {
    if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
@@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
      }
    }
  }
+  VLOG(4) << "Done attributes";
 }

 }  // namespace framework

--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
    library_type = LibraryType::kMKLDNN;
  } else if (kernel_key.backend() == phi::Backend::GPUDNN) {
    library_type = LibraryType::kCUDNN;
+  } else if (kernel_key.backend() == phi::Backend::KPS) {
+    library_type = LibraryType::kKP;
  } else {
    // do nothing
  }
@@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey(
    backend = phi::Backend::MKLDNN;
  } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
    backend = phi::Backend::GPUDNN;
+  } else if (kernel_type.library_type_ == LibraryType::kKP) {
+    backend = phi::Backend::KPS;
  } else {
    // do
  }
@@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor(
  dense_tensor->ResetHolder(shared_allocation);
 }

-void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
-                                  const platform::Place& place) {
-  if (phi::DenseTensor::classof(tensor)) {
-    auto* dense_tensor = static_cast<phi::DenseTensor*>(tensor);
-    if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
-      SetAllocationForUninitializedDenseTensor(dense_tensor, place);
-    }
-  } else if (phi::SelectedRows::classof(tensor)) {
-    auto* selected_rows = static_cast<phi::SelectedRows*>(tensor);
-    if (!selected_rows->value().IsInitialized() ||
-        !(selected_rows->place() == place)) {
-      SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
-                                               place);
-    }
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Unsupported tensor type is received when setting allocation for "
-        "output tensor."));
-  }
-}
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/phi_utils.h
+++ b/paddle/fluid/framework/phi_utils.h
@@ -62,9 +62,6 @@ class KernelArgsNameMaker {

 void InitDefaultKernelSignatureMap();

-void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
-                                  const platform::Place& place);
-
 // TODO(Wilber): support others device context.
 template <typename T>
 struct ConvertToPhiContext {

--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext(
            "Unsupported output `%s` type when call pt kernel.",
            framework::ToTypeName(var->Type())));
      }
-
-      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
-                                                      output_defs.at(i));
-      framework::SetAllocationForOutputTenosr(
-          tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
-
      kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
    }
    kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -75,9 +75,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }

 const std::vector<std::string> kTRTSubgraphPasses({
-  "conv_affine_channel_fuse_pass",  //
  "adaptive_pool2d_convert_global_pass",
-      "conv_eltwiseadd_affine_channel_fuse_pass",  //
      "shuffle_channel_detect_pass",          //
      "quant_conv2d_dequant_fuse_pass",       //
      "delete_quant_dequant_op_pass",         //
@@ -136,8 +134,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
    //   "identity_scale_op_clean_pass",             //
    "is_test_pass",                               //
        "simplify_with_basic_ops_pass",           //
-        "conv_affine_channel_fuse_pass",             //
-        "conv_eltwiseadd_affine_channel_fuse_pass",  //
        "conv_bn_fuse_pass",                      //
        "conv_eltwiseadd_bn_fuse_pass",           //
        "embedding_eltwise_layernorm_fuse_pass",  //
@@ -239,8 +235,6 @@ void CpuPassStrategy::EnableMKLDNN() {
             "depthwise_conv_mkldnn_pass",    //
             "conv_bn_fuse_pass",             // Execute BN passes again to
             "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-             "conv_affine_channel_fuse_pass",  //
-             "conv_eltwiseadd_affine_channel_fuse_pass",  //
             "conv_transpose_bn_fuse_pass",   //
             "conv_transpose_eltwiseadd_bn_fuse_pass",  //
             "conv_bias_mkldnn_fuse_pass",              //

--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext;
      ops::CastOpKernel<CUDA, plat::complex<float>>,                      \
      ops::CastOpKernel<CUDA, plat::complex<double>>, ##__VA_ARGS__);

-#if !defined(PADDLE_WITH_HIP)
 // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
 REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel<CUDA, plat::bfloat16>)
-#else
-REGISTER_CAST_CUDA_BASE(transfer_dtype)
-#endif
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -18,7 +18,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"

+#include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"

 #ifdef PADDLE_WITH_MKLDNN
@@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat");
-
-    auto inputs_dims = ctx->GetInputsDim("X");
-
-    const size_t inputs_num = inputs_dims.size();
-    PADDLE_ENFORCE_GT(
-        inputs_num, static_cast<size_t>(0),
-        platform::errors::InvalidArgument(
-            "The number of input tensors in concat op should > 0. But "
-            "received inputs' length is 0."));
-    if (inputs_num == 1) {
-      VLOG(3) << "Warning: concat op have only one input, may waste memory";
-    }
-
-    if (ctx->HasInput("AxisTensor")) {
-      auto out_dims =
-          phi::make_ddim(std::vector<int>(inputs_dims[0].size(), -1));
-      ctx->SetOutputDim("Out", out_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    } else {
-      size_t axis =
-          ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
-                      static_cast<int64_t>(inputs_dims[0].size()));
-      framework::DDim out_dims =
-          phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis);
-      if (out_dims[axis] < 0) {
-        out_dims[axis] = -1;
-      }
-      ctx->SetOutputDim("Out", out_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
@@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
+
+DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
+                            PT_INFER_META(phi::ConcatInferMeta));
+
 REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                  ops::ConcatGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ConcatGradOpMaker<paddle::imperative::OpBase>);
+                  ops::ConcatGradOpMaker<paddle::imperative::OpBase>,
+                  ConcatInferShapeFunctor);
 REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
                  ops::ConcatDoubleGradOpMaker<paddle::framework::OpDesc>,
                  ops::ConcatDoubleGradOpMaker<paddle::imperative::OpBase>,

--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -20,5 +20,5 @@ else()
 endif()

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n")
+file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
 file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/controlflow/logical_op.h"
 #include <algorithm>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"

 namespace paddle {
 namespace operators {
@@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp {
      ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);

 REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
-                               paddle::operators::LogicalAndFunctor);
 REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
-                               paddle::operators::LogicalOrFunctor);
 REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
-REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
-                              paddle::operators::LogicalNotFunctor);
 REGISTER_BINARY_LOGICAL_OP(logical_xor,
                           "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
-                               paddle::operators::LogicalXorFunctor);
--- a/paddle/fluid/operators/controlflow/logical_op.cu
+++ b/paddle/fluid/operators/controlflow/logical_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/logical_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename Functor>
-class BinaryLogicalOpKernel<platform::CUDADeviceContext, Functor>
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using InT = typename Functor::ELEMENT_TYPE;
-    using OutT = bool;
-
-    auto functor = Functor();
-    std::vector<const framework::Tensor*> ins;
-    std::vector<framework::Tensor*> outs;
-    const auto& cuda_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
-
-    if (ins.size() == 1) {
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                     InT, OutT>(
-          cuda_ctx, ins, &outs, axis, functor);
-    } else {
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                     InT, OutT>(
-          cuda_ctx, ins, &outs, axis, functor);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func)                            \
-  REGISTER_OP_CUDA_KERNEL(                                                     \
-      op_name,                                                                 \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<bool>>,    \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int8_t>>,  \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int16_t>>, \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int>>,     \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int64_t>>, \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<float>>,   \
-      ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<double>>);
-
-REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor)
-REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor)
-REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor)
-REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor)
-#undef REGISTER_LOGICAL_CUDA_KERNEL
--- a/paddle/fluid/operators/controlflow/logical_op.h
+++ b/paddle/fluid/operators/controlflow/logical_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-#define LOGICAL_BINARY_FUNCTOR(func_name, op)                \
-  template <typename T>                                      \
-  struct func_name {                                         \
-    using ELEMENT_TYPE = T;                                  \
-    HOSTDEVICE bool operator()(const T a, const T b) const { \
-      return static_cast<bool>(a) op static_cast<bool>(b);   \
-    }                                                        \
-  };
-
-LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
-LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
-LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
-#undef LOGICAL_BINARY_FUNCTOR
-
-template <typename T>
-struct LogicalNotFunctor {
-  using ELEMENT_TYPE = T;
-  HOSTDEVICE bool operator()(const T a) const { return !a; }
-};
-
-template <typename DeviceContext, typename Functor>
-class BinaryLogicalOpKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEMENT_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-    Functor binary_func;
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
-                                                          binary_func, out);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class UnaryLogicalOpKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEMENT_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    Functor unary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(),
-          out->mutable_data<bool>(context.GetPlace()), unary_func);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor)              \
-  REGISTER_OP_##dev##_KERNEL(                                              \
-      op_type, ::paddle::operators::BinaryLogicalOpKernel<                 \
-                   ::paddle::platform::dev##DeviceContext, functor<bool>>, \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<int8_t>>,        \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<int16_t>>,       \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<int>>,           \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,       \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<float>>,         \
-      ::paddle::operators::BinaryLogicalOpKernel<                          \
-          ::paddle::platform::dev##DeviceContext, functor<double>>);
-
-#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor)               \
-  REGISTER_OP_##dev##_KERNEL(                                              \
-      op_type, ::paddle::operators::UnaryLogicalOpKernel<                  \
-                   ::paddle::platform::dev##DeviceContext, functor<bool>>, \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<int8_t>>,        \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<int16_t>>,       \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<int>>,           \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,       \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<float>>,         \
-      ::paddle::operators::UnaryLogicalOpKernel<                           \
-          ::paddle::platform::dev##DeviceContext, functor<double>>);
--- a/paddle/fluid/operators/controlflow/logical_op_npu.cc
+++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/controlflow/logical_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {

--- a/paddle/fluid/operators/dot_op.cc
+++ b/paddle/fluid/operators/dot_op.cc
@@ -14,6 +14,10 @@

 #include "paddle/fluid/operators/dot_op.h"

+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 namespace paddle {
 namespace operators {

@@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"),
-                      platform::errors::PreconditionNotMet(
-                          "Input(X) of DotOp should not be null."));
-    PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"),
-                      platform::errors::PreconditionNotMet(
-                          "Input(Y) of DotOp should not be null."));
-    PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"),
-                      platform::errors::PreconditionNotMet(
-                          "Output(Out) of DotOp should not be null."));
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = static_cast<size_t>(x_dims.size());
-    PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank,
-                      platform::errors::PreconditionNotMet(
-                          "ShapeError: The dimensions of input tensor X (%s) "
-                          "should be 1 or 2",
-                          x_dims.to_str()));
-
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(
-        true, x_rank == (size_t)y_dims.size(),
-        platform::errors::PreconditionNotMet(
-            "ShapeError: The shape of input tensor Y: %s should match with "
-            "input tenosr X: %s",
-            y_dims.to_str(), x_dims.to_str()));
-    bool shape_match = true;
-    for (size_t i = 0; i < x_rank; ++i) {
-      if (x_dims[i] != y_dims[i]) {
-        shape_match = false;
-        break;
-      }
-    }
-
-    PADDLE_ENFORCE_EQ(true, shape_match,
-                      platform::errors::PreconditionNotMet(
-                          "ShapeError: The shape of input tensor X: %s should "
-                          "be exactly the same "
-                          "with input tensor Y: %s",
-                          x_dims.to_str(), y_dims.to_str()));
-    auto dims = vectorize(x_dims);
-    dims[dims.size() - 1] = 1;
-    ctx->SetOutputDim("Out", phi::make_ddim(dims));
-  }
-
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
@@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> {

 namespace ops = paddle::operators;

+DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
+                            PT_INFER_META(phi::DotInferMeta));
+
 REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
                  ops::DotOpGradMaker<paddle::framework::OpDesc>,
-                  ops::DotOpGradMaker<paddle::imperative::OpBase>);
+                  ops::DotOpGradMaker<paddle::imperative::OpBase>,
+                  DotInferShapeFunctor);

 REGISTER_OPERATOR(dot_grad, ops::DotGradOp);


--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
                       ops::GatherOpKernel<uint8_t>,
-                       ops::GatherOpKernel<int64_t>);
+                       ops::GatherOpKernel<int64_t>,
+                       ops::GatherOpKernel<phi::dtype::bfloat16>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
                       ops::GatherGradientOpKernel<double>,
                       ops::GatherGradientOpKernel<int>,
                       ops::GatherGradientOpKernel<uint8_t>,
-                       ops::GatherGradientOpKernel<int64_t>);
+                       ops::GatherGradientOpKernel<int64_t>,
+                       ops::GatherGradientOpKernel<phi::dtype::bfloat16>);
 REGISTER_OP_VERSION(gather)
    .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
                   paddle::framework::compatible::OpVersionDesc().NewInput(

--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
                        ops::GatherOpCUDAKernel<double>,
                        ops::GatherOpCUDAKernel<int64_t>,
                        ops::GatherOpCUDAKernel<int>,
-                        ops::GatherOpCUDAKernel<plat::float16>);
+                        ops::GatherOpCUDAKernel<plat::float16>,
+                        ops::GatherOpCUDAKernel<plat::bfloat16>);
 REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
                        ops::GatherGradOpCUDAKernel<double>,
                        ops::GatherGradOpCUDAKernel<int64_t>,
                        ops::GatherGradOpCUDAKernel<int>,
-                        ops::GatherGradOpCUDAKernel<plat::float16>);
+                        ops::GatherGradOpCUDAKernel<plat::float16>,
+                        ops::GatherGradOpCUDAKernel<plat::bfloat16>);
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -29,6 +29,7 @@ namespace operators {

 using DataLayout = framework::DataLayout;
 enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
+#define ALIGN_BYTES 16

 #define CHECK_CASE(i, flags, kernel_name, ...)                              \
  if (i == flags) {                                                         \
@@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
 template <typename T>
 __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
                                              int imsize, int groups,
-                                              int group_size, T* mean, T* var,
-                                              const DataLayout data_layout) {
+                                              int group_size, T* mean, T* var) {
  int gid = blockIdx.y;
  int cid = blockIdx.x;
  int bid = blockIdx.z;
@@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
  T x_mean = 0, x_var = 0;
  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
    T val;
-    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid];
-    } else {
    int hid = imid / W;
    int wid = imid % W;
    val = x[(bid * H + hid) * W * C + wid * C + ccid];
-    }
+
    x_mean += val;
    x_var += val * val;
  }
@@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
  CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 }

+template <typename T, typename AccT, int VecSize>
+__device__ __forceinline__ void ThreadReduce(const T* input, int size,
+                                             const int offset, AccT* mean,
+                                             AccT* var) {
+  using VecT = kps::details::VectorType<T, VecSize>;
+  int tid = threadIdx.x;
+  if (offset > 0) {
+    input -= offset;
+    size += offset;
+    if (tid >= offset) {
+      AccT temp = input[tid];
+      *mean += temp;
+      *var += temp * temp;
+    }
+    size -= blockDim.x;
+    input += blockDim.x;
+  }
+  int remain = size % (VecSize * blockDim.x);
+
+  T ins[VecSize];
+  VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
+
+  // vector part
+  for (; VecSize * tid < (size - remain); tid += blockDim.x) {
+    *ins_vec = reinterpret_cast<const VecT*>(input)[tid];
+
+#pragma unroll
+    for (int i = 0; i < VecSize; ++i) {
+      AccT temp = ins[i];
+      *mean += temp;
+      *var += temp * temp;
+    }
+  }
+
+  // scalar part
+  tid = size - remain + threadIdx.x;
+  for (; tid < size; tid += blockDim.x) {
+    AccT temp = input[tid];
+    *mean += temp;
+    *var += temp * temp;
+  }
+}
+
+template <typename T>
+__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
+  int i = blockIdx.x;
+  T x_mean = 0, x_var = 0;
+  for (int j = threadIdx.x; j < size; j += blockDim.x) {
+    T val;
+    val = x[i * size + j];
+    x_mean += val;
+    x_var += val * val;
+  }
+  x_mean /= size;
+  x_var /= size;
+  CudaAtomicAddWithWarp(&mean[i], x_mean);
+  CudaAtomicAddWithWarp(&var[i], x_var);
+}
+
+template <typename T, typename AccT, int VecSize>
+__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
+                                            int size) {
+  int i = blockIdx.x;
+  AccT x_mean = static_cast<AccT>(0);
+  AccT x_var = static_cast<AccT>(0);
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
+  x += i * size;
+  ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
+  x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
+      x_mean, kps::AddFunctor<AccT>());
+  x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
+      x_var, kps::AddFunctor<AccT>());
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    mean[i] = static_cast<T>(x_mean / size);
+    var[i] = static_cast<T>(x_var / size);
+  }
+}
+
 template <typename T, int flags>
 __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
                                 const T* scale, const T* bias, int N, int C,
@@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
  int H = imsize / W;
  int ccid = gid * group_size + cid;
  if (ccid >= C) return;
-  T x_mean = mean[bid * groups + gid];
-  T x_var = var[bid * groups + gid];
+  auto ng = bid * groups + gid;
+  T x_mean = mean[ng];
+  T x_var = var[ng];
  x_var = x_var - x_mean * x_mean;
-  T var_inv = 1.0 / sqrt(x_var + epsilon);
-  if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
+  T var_inv = rsqrt(x_var + epsilon);
+  if (cid == 0 && threadIdx.x == 0) {
+    real_var[ng] = x_var;
+  }
  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
    T val;
    int hid, wid;
+    int index = (bid * C + ccid) * imsize + imid;
    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid];
+      val = x[index];
    } else {
      hid = imid / W;
      wid = imid % W;
      val = x[(bid * H + hid) * W * C + wid * C + ccid];
    }
    val = (val - x_mean) * var_inv;
-    if (flags & kHasScale) val *= scale[gid * group_size + cid];
-    if (flags & kHasBias) val += bias[gid * group_size + cid];
+    if (flags & kHasScale) {
+      val *= scale[ccid];
+    }
+    if (flags & kHasBias) {
+      val += bias[ccid];
+    }
    if (data_layout == DataLayout::kNCHW) {
-      y[(bid * C + ccid) * imsize + imid] = val;
+      y[index] = val;
    } else {
      y[(bid * H + hid) * W * C + wid * C + ccid] = val;
    }
@@ -182,16 +266,41 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
        imsize *= x_dims[i];
      }
    }
+
 #ifdef __HIPCC__
    int block_size = std::max(std::min(256, imsize), 64);
 #else
    int block_size = std::min(1024, imsize);
 #endif
+
    dim3 grid(group_size, groups, x_dims[0]);
    dim3 threads(block_size, 1, 1);
+    if (data_layout == DataLayout::kNCHW) {
+      using AccT = typename details::MPTypeTrait<T>::Type;
+      constexpr int vec_size = sizeof(float4) / sizeof(T);
+      int size = group_size * imsize;
+      const int max_num_threads = 1024;
+      int max_block_size = std::min(size / vec_size, max_num_threads);
+      int block_size_nchw = 1;
+      while (block_size_nchw < max_block_size) {
+        block_size_nchw *= 2;
+      }
+      block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
+      dim3 grids(x_dims[0] * groups);
+      dim3 blocks(block_size_nchw);
+      if (size < vec_size) {
+        ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, temp_var_data, size);
+      } else {
+        VectorizedGetMeanAndVarNCHW<
+            T, AccT, vec_size><<<grids, blocks, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, temp_var_data, size);
+      }
+    } else {
      GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
          x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
-        temp_var_data, data_layout);
+          temp_var_data);
+    }
    int flags =
        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
    UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,

--- a/paddle/fluid/operators/index_sample_op.cc
+++ b/paddle/fluid/operators/index_sample_op.cc
@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/index_sample_op.h"
 #include <vector>
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"

+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 namespace paddle {
 namespace operators {
 class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
 class IndexSampleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Inputs(Input) of FindByIndex should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      platform::errors::InvalidArgument(
-                          "Inputs(Index) of FindByIndex should not be null."));
-
-    auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "Inputs(X) shape of IndexSample op should be 2-D, but "
-            "got X's shape = [%s], please check X shape.",
-            input_dims));
-
-    auto index_dims = ctx->GetInputDim("Index");
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "Inputs(Index) shape of IndexSample op should be 2-D, but "
-            "got Index's shape [%s] , please check index shape.",
-            input_dims));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0],
-                        platform::errors::InvalidArgument(
-                            "Inputs(X)'s value of dimension 0 must same with "
-                            "Inputs(Index)'s value of dimension 0, but "
-                            "got %d of Inputs(X), and got %d of Inputs(Index), "
-                            "please check Inputs shape.",
-                            input_dims[0], index_dims[0]));
-    }
-    ctx->SetOutputDim("Out", index_dims);
-    auto type = ctx->GetInputsVarType("Index")[0];
-    if (type == framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("Index", /*->*/ "Out");
-    }
-  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
@@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X");
 }  // namespace paddle

 namespace ops = paddle::operators;
+DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
+                            PT_INFER_META(phi::IndexSampleInferMeta));
 REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker,
                  ops::IndexSampleGradMaker<paddle::framework::OpDesc>,
-                  ops::IndexSampleGradMaker<paddle::imperative::OpBase>);
+                  ops::IndexSampleGradMaker<paddle::imperative::OpBase>,
+                  IndexSampleInferShapeFunctor);
 REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp,
                  ops::IndexSampleGradNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(
-    index_sample,
-    ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    index_sample_grad,
-    ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/index_sample_op.cu
+++ b/paddle/fluid/operators/index_sample_op.cu
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/index_sample_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#define PREDEFINED_BLOCK_SIZE_X 512
-#define PREDEFINED_BLOCK_SIZE 1024
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-namespace paddle {
-namespace operators {
-
-namespace {
-void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
-  auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
-                          .GetCUDAMaxGridDimSize();
-  grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
-  grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
-}
-}
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T, typename IndexT = int>
-__global__ void IndexSampleForward(const IndexT* index, const T* in_data,
-                                   T* out_data, size_t index_length,
-                                   size_t input_length, size_t batch_size) {
-  unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
-  for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
-    index_i = blockDim.x * blockIdx.x + threadIdx.x;
-    for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
-      unsigned int index_idx = index_j * index_length + index_i;
-      unsigned int in_idx = index_j * input_length + index_i;
-      IndexT sample_idx = index[index_idx];
-      out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
-    }
-  }
-}
-
-template <typename T, typename IndexT = int>
-__global__ void IndexSampleGrad(const IndexT* index, T* in_grad,
-                                const T* out_grad, size_t index_length,
-                                size_t input_length, size_t batch_size,
-                                bool same_data_in_row = true) {
-  unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
-
-  for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
-    index_i = blockDim.x * blockIdx.x + threadIdx.x;
-    for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
-      unsigned int index_idx = index_j * index_length + index_i;
-      unsigned int in_idx = index_j * input_length + index_i;
-      IndexT sample_idx = index[index_idx];
-      if (same_data_in_row) {
-        platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
-                                out_grad[sample_idx]);
-      } else {
-        in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
-      }
-    }
-  }
-}
-
-template <typename T>
-class IndexSampleKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("X");
-    auto* index = ctx.Input<LoDTensor>("Index");
-    auto* output = ctx.Output<LoDTensor>("Out");
-
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT64 ||
-                            index_type == framework::proto::VarType::INT32;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    const auto* in_data = input->data<T>();
-    auto* out_data = output->mutable_data<T>(ctx.GetPlace());
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-
-    auto input_dim = input->dims();
-    auto index_dim = index->dims();
-    size_t batch_size = input_dim[0];
-    size_t input_length = input_dim[1];
-    size_t index_length = index_dim[1];
-
-    auto block_width = platform::RoundToPowerOfTwo(index_length);
-    block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
-    int block_height =
-        platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
-    block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
-    dim3 block_dim(block_width, block_height);
-    dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
-                  (batch_size + block_dim.y - 1) / block_dim.y);
-    LimitGridDim(ctx, &grid_dim);
-
-    if (index_type == framework::proto::VarType::INT64) {
-      const int64_t* index_data = index->data<int64_t>();
-      IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
-          index_data, in_data, out_data, index_length, input_length,
-          batch_size);
-    } else if (index_type == framework::proto::VarType::INT32) {
-      const int* index_data = index->data<int>();
-      IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
-          index_data, in_data, out_data, index_length, input_length,
-          batch_size);
-    }
-  }
-};
-
-template <typename T>
-class IndexSampleGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* output_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* input_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* index = ctx.Input<LoDTensor>("Index");
-
-    const auto* output_grad_data = output_grad->data<T>();
-    auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT64 ||
-                            index_type == framework::proto::VarType::INT32;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-    auto input_num = input_grad->numel();
-    auto input_dim = input_grad->dims();
-    auto index_dim = index->dims();
-    size_t batch_size = index_dim[0];
-    size_t input_length = input_dim[1];
-    size_t index_length = index_dim[1];
-    bool same_data_in_index_row = index_length == 1 ? false : true;
-
-    auto block_width = platform::RoundToPowerOfTwo(index_length);
-    block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
-    auto block_height =
-        platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
-    block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
-    dim3 block_dim(block_width, block_height);
-    dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
-                  (batch_size + block_dim.y - 1) / block_dim.y);
-    LimitGridDim(ctx, &grid_dim);
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    set_zero(dev_ctx, input_grad, static_cast<T>(0));
-
-    if (index_type == framework::proto::VarType::INT64) {
-      const int64_t* index_data = index->data<int64_t>();
-      IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
-          index_data, input_grad_data, output_grad_data, index_length,
-          input_length, batch_size, same_data_in_index_row);
-    } else if (index_type == framework::proto::VarType::INT32) {
-      const int* index_data = index->data<int>();
-      IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
-          index_data, input_grad_data, output_grad_data, index_length,
-          input_length, batch_size, same_data_in_index_row);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    index_sample,
-    ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    index_sample_grad,
-    ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/index_sample_op.h
+++ b/paddle/fluid/operators/index_sample_op.h
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <fstream>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-
-template <typename T, typename IndexT = int>
-void IndexSampleInner(const framework::ExecutionContext &context,
-                      const LoDTensor &input, const LoDTensor &index,
-                      LoDTensor *output) {
-  auto input_dims = input.dims();
-  auto index_dims = index.dims();
-
-  int batch_size = input_dims[0];
-  auto value_length = input_dims[1];
-  auto index_length = index_dims[1];
-  int index_ids_num = index.numel();
-
-  std::vector<T> input_vec;
-  std::vector<IndexT> index_vec;
-  paddle::framework::TensorToVector(input, context.device_context(),
-                                    &input_vec);
-  paddle::framework::TensorToVector(index, context.device_context(),
-                                    &index_vec);
-
-  std::vector<T> res(index_ids_num);
-  for (int i = 0; i < index_ids_num; i++) {
-    int b = floor(i / index_length);
-    PADDLE_ENFORCE_GE(
-        index_vec[i], 0,
-        platform::errors::InvalidArgument(
-            "Variable value (index) of OP(index_sample) "
-            "expected >= 0 and < %ld, but got %ld. Please check input "
-            "value.",
-            value_length, index_vec[i]));
-    PADDLE_ENFORCE_LT(
-        index_vec[i], value_length,
-        platform::errors::InvalidArgument(
-            "Variable value (index) of OP(index_sample) "
-            "expected >= 0 and < %ld, but got %ld. Please check input "
-            "value.",
-            value_length, index_vec[i]));
-
-    int v_i = b * value_length + static_cast<int>(index_vec[i]);
-    T v = input_vec[v_i];
-    VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
-            << " value = " << v;
-    res[i] = v;
-  }
-
-  auto ddim = phi::make_ddim({batch_size, index_length});
-  output->mutable_data<T>(context.GetPlace());
-  framework::TensorFromVector(res, context.device_context(), output);
-  output->Resize(ddim);
-}
-
-template <typename DeviceContext, typename T>
-class IndexSampleKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *input_var = ctx.InputVar("X");
-    auto *index_var = ctx.InputVar("Index");
-
-    auto &input_tensor = input_var->Get<LoDTensor>();
-    auto &index_tensor = index_var->Get<LoDTensor>();
-
-    auto *out_var = ctx.OutputVar("Out");
-    auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
-
-    const auto &index_type =
-        framework::TransToProtoVarType(index_tensor.dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      IndexSampleInner<T, int>(ctx, input_tensor, index_tensor, out_tensor);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      IndexSampleInner<T, int64_t>(ctx, input_tensor, index_tensor, out_tensor);
-    }
-  }
-};
-
-template <typename T, typename IndexT = int>
-void IndexSampleGradInner(const framework::ExecutionContext &context,
-                          const LoDTensor &out_grad, const LoDTensor &index,
-                          LoDTensor *x_grad) {
-  std::vector<T> out_grad_vec;
-  std::vector<IndexT> index_vec;
-  paddle::framework::TensorToVector(out_grad, context.device_context(),
-                                    &out_grad_vec);
-  paddle::framework::TensorToVector(index, context.device_context(),
-                                    &index_vec);
-
-  auto index_dims = index.dims();
-  auto x_grad_dims = x_grad->dims();
-
-  auto value_length = x_grad_dims[1];
-  auto index_length = index_dims[1];
-  int index_ids_num = index.numel();
-
-  std::vector<T> x_grad_vec(x_grad->numel(), 0);
-
-  for (int i = 0; i < index_ids_num; i++) {
-    int b = floor(i / index_length);
-    PADDLE_ENFORCE_GE(
-        index_vec[i], 0,
-        platform::errors::InvalidArgument(
-            "Variable value (index) of OP(index_sample_grad) "
-            "expected >= 0 and < %ld, but got %ld. Please check input "
-            "value.",
-            value_length, index_vec[i]));
-    PADDLE_ENFORCE_LT(
-        index_vec[i], value_length,
-        platform::errors::InvalidArgument(
-            "Variable value (index) of OP(index_sample_grad) "
-            "expected >= 0 and < %ld, but got %ld. Please check input "
-            "value.",
-            value_length, index_vec[i]));
-    int v_i = b * value_length + static_cast<int>(index_vec[i]);
-    x_grad_vec[v_i] += out_grad_vec[i];
-  }
-  x_grad->mutable_data<T>(context.GetPlace());
-  framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad);
-  x_grad->Resize(x_grad_dims);
-}
-
-template <typename DeviceContext, typename T>
-class IndexSampleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *index_var = context.InputVar("Index");
-    auto *x_grad_var = context.OutputVar(framework::GradVarName("X"));
-    auto *out_grad_var = context.InputVar(framework::GradVarName("Out"));
-
-    auto &index_tensor = index_var->Get<LoDTensor>();
-    auto &out_grad_tensor = out_grad_var->Get<LoDTensor>();
-    auto *x_grad_tensor = x_grad_var->GetMutable<framework::LoDTensor>();
-
-    const auto &index_type =
-        framework::TransToProtoVarType(index_tensor.dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      IndexSampleGradInner<T, int>(context, out_grad_tensor, index_tensor,
-                                   x_grad_tensor);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      IndexSampleGradInner<T, int64_t>(context, out_grad_tensor, index_tensor,
-                                       x_grad_tensor);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/index_sample_op_npu.cc
+++ b/paddle/fluid/operators/index_sample_op_npu.cc
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/index_sample_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {

--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
    for (int it = 0; it < LDGS; it++) {
 #pragma unroll
      for (int jt = 0; jt < VecSize; jt++) {
-        U x_tmp = x[it][jt];
+        U x_tmp = static_cast<U>(x[it][jt]);
        U y_tmp = var_cur_row * (x_tmp - mean_cur_row);
        U dy_tmp = static_cast<U>(gamma[it][jt]) *
                   static_cast<U>(dout[it][jt]);    // scale * dy
-        U dout_tmp = dout[it][jt];                // dy
+        U dout_tmp = static_cast<U>(dout[it][jt]);  // dy

        // used for get dx (row reduction)
        sum_loss1 += dy_tmp;          // scale * dy, sum_1

--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL(
    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
                             plat::float16>);
+#elif CUDNN_VERSION_MIN(8, 1, 0)
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
+                             plat::float16>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
+                             plat::bfloat16>);
 #else
 REGISTER_OP_CUDA_KERNEL(
    layer_norm,

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"

 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/operators/mkldnn/axpy_handler.h"
@@ -502,32 +503,29 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
    out.mutable_value()->mutable_data<T>(
        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
        context.GetPlace());
-    int r =
-        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
-                         merge_rows.size() * input_width, static_cast<T>(0.f));
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::External("XPU constant op return"
-                                                 " wrong value[%d %s].",
-                                                 r, XPUAPIErrorMsg[r]));

    std::unordered_map<int64_t, size_t> rows_to_id;
    for (size_t i = 0; i < merge_rows.size(); ++i) {
      rows_to_id[merge_rows[i]] = i;
    }

-    auto* out_data = out.mutable_value()->data<T>();
-    auto* input_data = input.value().data<T>();
+    auto* y_data = out.mutable_value()->data<T>();
+    auto* x_data = input.value().data<T>();
+    int xm = input_rows.size();
+    int ym = merge_rows.size();
    int n = input_width;
-    for (size_t i = 0; i < input_rows.size(); i++) {
-      size_t out_i = rows_to_id[input_rows[i]];
-      auto r = xpu::add(context.x_context(), &input_data[i * input_width],
-                        &out_data[out_i * input_width],
-                        &out_data[out_i * input_width], n);
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU API return wrong value[%d %s], ", r,
-                                     XPUAPIErrorMsg[r]));
-    }
+
+    xpu::ctx_guard RAII_GUARD(context.x_context());
+    int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
+    int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
+    memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
+                 merge_rows.data(), ym * sizeof(int64_t));
+    memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
+                 input_rows.data(), xm * sizeof(int64_t));
+    int r =
+        xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
+                                        x_rows_data, y_rows_data, xm, n, ym);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
  }

  void operator()(const platform::XPUDeviceContext& context,
@@ -582,15 +580,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
            {static_cast<int64_t>(merged_row_set.size()), input_width}),
        context.GetPlace());

-    int r =
-        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
-                         merge_rows.size() * input_width, static_cast<T>(0.f));
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::External("XPU constant op return"
-                                                 " wrong value[%d %s].",
-                                                 r, XPUAPIErrorMsg[r]));
-
-    float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
+    float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());

    std::unordered_map<int64_t, size_t> rows_to_id;
    for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -603,17 +593,22 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
      }
      auto& input_rows = input->rows();

+      auto* x_data = input->value().data<T>();
+      int xm = input_rows.size();
+      int ym = merge_rows.size();
      int n = input_width;
-      for (size_t i = 0; i < input_rows.size(); i++) {
-        size_t out_i = rows_to_id[input_rows[i]];
-        auto r = xpu::add(
-            context.x_context(), input->value().data<T>() + i * input_width,
-            &out_data[out_i * input_width], &out_data[out_i * input_width], n);
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API return wrong value[%d %s], ", r,
-                                       XPUAPIErrorMsg[r]));
-      }
+
+      xpu::ctx_guard RAII_GUARD(context.x_context());
+      int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
+      int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
+      memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
+                   merge_rows.data(), ym * sizeof(int64_t));
+      memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
+                   input_rows.data(), xm * sizeof(int64_t));
+      int r =
+          xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
+                                          x_rows_data, y_rows_data, xm, n, ym);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
    }
  }
 };

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -445,6 +446,7 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
 template struct MergeAdd<platform::CUDADeviceContext, int>;
 template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
 template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
+template struct MergeAdd<platform::CUDADeviceContext, platform::bfloat16>;
 template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>;
 template struct MergeAdd<platform::CUDADeviceContext,
                         platform::complex<double>>;

--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
@@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker
              "The fp32 beta1 power accumulator tensor. Its shape is [1].");
    AddOutput("Beta2Pow",
              "The fp32 beta2 power accumulator tensor. Its shape is [1].");
-    AddOutput("FusedIndices",
-              "The param index of each element in FP32FusedParam. Its shape is "
-              "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
    AddOutput(
        "FusedParamOffsets",
        "The numel offset of each parameter inside the FP32FusedParam. Its "
        "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
-        "+ n_2, ...].");
-    AddOutput("FP32ShardFusedParamOffsets",
-              "The sharded numel offset of each parameter in the local rank. "
-              "Its shape is [fp32_local_param_num + 1].");
-    AddOutput("FP16ShardFusedParamOffsets",
+        "+ n_2, ...]. It should be in CPUPlace.");
+    AddOutput(
+        "FP32ShardFusedParamOffsets",
        "The sharded numel offset of each parameter in the local rank. "
-              "Its shape is [fp16_local_param_num + 1].");
+        "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
    AddOutput(
-        "WeightDecay",
-        "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
+        "FP16ShardFusedParamOffsets",
+        "The sharded numel offset of each parameter in the local rank. "
+        "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
    AddOutput("ParamInfo",
              "The param info. It should be in CPUPlace, and its shape is [6]"
-              "CPUPlace, and its shape is [6]. It is "
+              "CPUPlace, and its shape is [8]. It is "
              "[fp32_shard_param_start_idx, fp32_local_param_num, "
-              "fp32_global_param_num, fp16_shard_param_start_idx, "
-              "fp16_local_param_num, fp16_global_param_num].");
-
+              "fp32_global_param_num, fp32_weight_decay_end_idx, "
+              "fp16_shard_param_start_idx, "
+              "fp16_local_param_num, fp16_global_param_num, "
+              "fp16_weight_decay_end_idx].");
+    AddOutput("ParamOrder",
+              "The reordered parameter order. Inside this op, "
+              "the parameter would be reordered by data type and weight decay "
+              "value.");
    AddOutput("ParamOut", "The output parameter list.").AsDuplicable();
    AddOutput("MasterParamOut",
              "The output master parameter list. It would share the memory of "
@@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker

    AddAttr<float>("beta1", "The initial value of Beta1Pow.");
    AddAttr<float>("beta2", "The initial value of Beta2Pow.");
-    AddAttr<std::vector<float>>(
-        "weight_decay",
-        "The weight decay for each parameter. Its "
-        "shape is equal to the global parameter number.");
+    AddAttr<std::vector<int>>("apply_weight_decay",
+                              "Whether to apply weight decay.");
    AddAttr<int>("alignment", "The alignment in bytes for the fused tensors.");
    AddAttr<int>("rank", "The global rank of the current process.");
    AddAttr<int>("nranks", "The global world size.");

--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
@@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin,
           << ") , dtype = " << fused_out->dtype();
 }

-template <typename OffsetT, typename IndexT>
-static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets,
-                                                      IndexT *out,
-                                                      int offset_num,
-                                                      int out_num) {
-  CUDA_KERNEL_LOOP_TYPE(i, out_num, int) {
-    auto idx = phi::funcs::LowerBound(offsets, offset_num, i);
-    if (idx == offset_num || offsets[idx] != i) {
-      --idx;
-    }
-    out[i] = idx;
-  }
-}
-
-template <typename T>
-static void CopyVectorToTensor(const std::vector<T> &src,
-                               framework::Tensor *dst,
-                               const platform::Place &place,
-                               gpuStream_t stream) {
-  dst->Resize({static_cast<int64_t>(src.size())});
-  T *dst_ptr = dst->mutable_data<T>(place);
-  const T *src_ptr = src.data();
-  auto nbytes = src.size() * sizeof(T);
-  memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream);
-}
-
 template <typename T>
 static void CopyVectorToCPUTensor(const std::vector<T> &src,
                                  framework::Tensor *dst) {
@@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector<T> &src,
  std::memcpy(dst_ptr, src_ptr, nbytes);
 }

+static size_t ReorderParamGradInfoList(const std::vector<int> &flags,
+                                       std::vector<ParamGradInfo> *infos) {
+  size_t n = infos->size();
+  std::vector<int> cur_flags;
+  cur_flags.reserve(n);
+  for (size_t i = 0; i < n; ++i) {
+    auto idx = (*infos)[i].idx;
+    cur_flags.push_back(flags[idx]);
+  }
+
+  auto origin_infos = *infos;
+  size_t j = 0;
+  for (size_t i = 0; i < n; ++i) {
+    if (cur_flags[i]) {
+      (*infos)[j] = origin_infos[i];
+      ++j;
+    }
+  }
+  size_t ret_idx = j;
+
+  for (size_t i = 0; i < n; ++i) {
+    if (!cur_flags[i]) {
+      (*infos)[j] = origin_infos[i];
+      ++j;
+    }
+  }
+  return ret_idx;
+}
+
+template <typename T>
+static T ClipByBound(T x, T low_value, T high_value) {
+  if (x < low_value) return low_value;
+  if (x > high_value) return high_value;
+  return x;
+}
+
 template <typename T>
 class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
@@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
        info->numel_offset = 0;        // not determined yet
      }
    }
+    const auto &apply_weight_decay =
+        ctx.Attr<std::vector<int>>("apply_weight_decay");
+    size_t fp32_wd_end_idx =
+        ReorderParamGradInfoList(apply_weight_decay, &fp32_infos);
+    size_t fp16_wd_end_idx =
+        ReorderParamGradInfoList(apply_weight_decay, &fp16_infos);
+
+    auto *param_order_t = ctx.Output<framework::Tensor>("ParamOrder");
+    auto param_num = fp32_infos.size() + fp16_infos.size();
+    param_order_t->Resize({static_cast<int16_t>(param_num)});
+    auto *param_order = param_order_t->mutable_data<int>(platform::CPUPlace());
+    for (size_t i = 0; i < fp32_infos.size(); ++i) {
+      param_order[i] = static_cast<int>(fp32_infos[i].idx);
+    }
+    for (size_t i = 0; i < fp16_infos.size(); ++i) {
+      param_order[i + fp32_infos.size()] = static_cast<int>(fp16_infos[i].idx);
+    }
+
    VLOG(10) << "Fill ParamGradInfo ends";

    // Step 2: determine the numel_with_padding and numel_offset
@@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
    VLOG(10) << "Found the sharding arguments";

    auto *param_info_t = ctx.Output<framework::Tensor>("ParamInfo");
-    param_info_t->Resize({6});
+    param_info_t->Resize({8});
    auto *param_info = param_info_t->mutable_data<int>(platform::CPUPlace());
    param_info[0] = static_cast<int>(fp32_start_idx);
    param_info[1] = static_cast<int>(fp32_local_param_num);
    param_info[2] = static_cast<int>(fp32_infos.size());
-    param_info[3] = static_cast<int>(fp16_start_idx + fp32_infos.size());
-    param_info[4] = static_cast<int>(fp16_local_param_num);
-    param_info[5] = static_cast<int>(fp16_infos.size());
+    param_info[3] = ClipByBound<int>(fp32_wd_end_idx, fp32_start_idx,
+                                     fp32_start_idx + fp32_local_param_num) -
+                    static_cast<int>(fp32_start_idx);
+    param_info[4] = static_cast<int>(fp16_start_idx + fp32_infos.size());
+    param_info[5] = static_cast<int>(fp16_local_param_num);
+    param_info[6] = static_cast<int>(fp16_infos.size());
+    param_info[7] = ClipByBound<int>(fp16_wd_end_idx, fp16_start_idx,
+                                     fp16_start_idx + fp16_local_param_num) -
+                    static_cast<int>(fp16_start_idx);

    VLOG(10) << "Start FP32 idx: " << param_info[0];
    VLOG(10) << "Local FP32 param num: " << param_info[1];
    VLOG(10) << "Global FP32 param num: " << param_info[2];

-    VLOG(10) << "Start FP16 idx: " << param_info[3];
-    VLOG(10) << "Local FP16 param num: " << param_info[4];
-    VLOG(10) << "Global FP16 param num: " << param_info[5];
+    VLOG(10) << "Start FP16 idx: " << param_info[4];
+    VLOG(10) << "Local FP16 param num: " << param_info[5];
+    VLOG(10) << "Global FP16 param num: " << param_info[6];

-    // For WeightDecay, shard and perform H2D copy
-    const auto &origin_weight_decay =
-        ctx.Attr<std::vector<float>>("weight_decay");
-    PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(),
-                      platform::errors::InvalidArgument(
-                          "The attr(weight_decay) should have the "
-                          "same length with Input(Param)."));
-    std::vector<float> shard_weight_decay;
-    shard_weight_decay.reserve(total_local_param_num);
-    for (size_t i = 0; i < fp32_local_param_num; ++i) {
-      shard_weight_decay.push_back(
-          origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]);
-    }
-    for (size_t i = 0; i < fp16_local_param_num; ++i) {
-      shard_weight_decay.push_back(
-          origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]);
-    }
-
-    // For FusedIndices, launch CUDA kernel to do binary search
-    auto *fused_indices_t = ctx.Output<framework::Tensor>("FusedIndices");
-    fused_indices_t->Resize({static_cast<int64_t>(total_numel)});
-    auto *fused_indices = fused_indices_t->mutable_data<int>(place);
    std::vector<int> numel_offsets;
    numel_offsets.reserve(params.size() + 1);
    for (const auto &info : fp32_infos) {
@@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
                          "The numel_offsets number must be one larger than "
                          "the parameter number."));
    VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets);
-    auto *fused_param_offset_t =
-        ctx.Output<framework::Tensor>("FusedParamOffsets");
-    fused_param_offset_t->Resize({static_cast<int64_t>(numel_offsets.size())});
-    auto *fused_param_offset = fused_param_offset_t->mutable_data<int>(place);
-    memory::Copy(place, fused_param_offset, platform::CPUPlace(),
-                 numel_offsets.data(),
-                 numel_offsets.size() * sizeof(numel_offsets[0]), stream);
-    auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel);
-    LambFillFusedIndicesCUDAKernel<<<config.block_per_grid,
-                                     config.thread_per_block, 0, stream>>>(
-        fused_param_offset, fused_indices, numel_offsets.size() - 1,
-        total_numel);
-
-    std::vector<int> lengths;
-    lengths.reserve(fp32_local_param_num + fp16_local_param_num);

    std::vector<int> fp32_partial_numel_offsets;
    fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1);
@@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
      VLOG(10) << "FP32 Partial numel = ["
               << valid_start_n + fp32_infos[i].numel << ","
               << end_n + fp32_infos[i].numel;
-      lengths.push_back(end_n - valid_start_n);
+      auto len = end_n - valid_start_n;
      fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() +
-                                           lengths.back());
+                                           len);
    }

    std::vector<int> fp16_partial_numel_offsets;
@@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
      PADDLE_ENFORCE_NE(valid_start_n, end_n,
                        platform::errors::InvalidArgument(
                            "Indices sharding error. This may be a bug."));
-      lengths.push_back(end_n - valid_start_n);
+      auto len = end_n - valid_start_n;
      fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() +
-                                           lengths.back());
+                                           len);
    }

    CopyVectorToCPUTensor(numel_offsets,
@@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
        fp16_partial_numel_offsets,
        ctx.Output<framework::Tensor>("FP16ShardFusedParamOffsets"));

-    // Fill the weight decay tensor
-    PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(),
-                      platform::errors::InvalidArgument(
-                          "Invalid weight decay sharding. This may be a bug."));
-    std::vector<float> wd_cpu;
-    for (size_t i = 0; i < shard_weight_decay.size(); ++i) {
-      int len = lengths[i];
-      for (int j = 0; j < len; ++j) {
-        wd_cpu.push_back(shard_weight_decay[i]);
-      }
-    }
-    PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel,
-                      platform::errors::InvalidArgument(
-                          "Invalid weight decay sharding. This may be a bug."));
-    CopyVectorToTensor(wd_cpu, ctx.Output<framework::Tensor>("WeightDecay"),
-                       place, stream);
-
    auto *global_scale = ctx.Output<framework::Tensor>("GlobalScale");
    if (!global_scale->IsInitialized()) {
      TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);

--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
             "The fp32 beta1 power accumulator tensor. Its shape is [1].");
    AddInput("Beta2Pow",
             "The fp32 beta2 power accumulator tensor. Its shape is [1].");
-    AddInput("FusedIndices",
-             "The param index of each element in FP32FusedParam. Its shape is "
-             "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
    AddInput(
        "FusedParamOffsets",
        "The numel offset of each parameter inside the FP32FusedParam. Its "
        "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
-        "+ n_2, ...].");
-    AddInput("FP32ShardFusedParamOffsets",
+        "+ n_2, ...]. It should be in CPUPlace.");
+    AddInput(
+        "FP32ShardFusedParamOffsets",
        "The sharded numel offset of each parameter in the local rank. "
-             "Its shape is [fp32_local_param_num + 1].");
-    AddInput("FP16ShardFusedParamOffsets",
+        "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
+    AddInput(
+        "FP16ShardFusedParamOffsets",
        "The sharded numel offset of each parameter in the local rank. "
-             "Its shape is [fp16_local_param_num + 1].");
-    AddInput("WeightDecay",
-             "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
+        "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
    AddInput("ParamInfo",
             "The param info. It should be in CPUPlace, and its shape is [6]"
-             "CPUPlace, and its shape is [6]. It is "
+             "CPUPlace, and its shape is [8]. It is "
             "[fp32_shard_param_start_idx, fp32_local_param_num, "
-             "fp32_global_param_num, fp16_shard_param_start_idx, "
-             "fp16_local_param_num, fp16_global_param_num].");
+             "fp32_global_param_num, fp32_weight_decay_end_idx, "
+             "fp16_shard_param_start_idx, "
+             "fp16_local_param_num, fp16_global_param_num, "
+             "fp16_weight_decay_end_idx].");
+    AddInput("ParamOrder",
+             "The reordered parameter order. Inside this op, "
+             "the parameter would be reordered by data type and weight decay "
+             "value.");

    AddInput("LearningRate",
             "The fp32 learning rate tensor. Its shape is [1].");
@@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
        "max_global_grad_norm",
        "The maximum global gradient l2-norm value for clipping. If "
        "max_global_grad_norm <= 0, no clipping would be performed.");
+    AddAttr<float>("weight_decay", "The weight decay value.");
    AddAttr<bool>("clip_after_allreduce",
                  "Whether to clip before allreduce, only valid when the "
                  "world size is larger than 1.");

--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
--- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h
+++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h
@@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel(
          args...);
 }

-template <typename Functor, int BlockDim, int MaxTensorNumPerLaunch,
-          int MaxChunkNumPerLaunch, typename... Args>
-static void MultiTensorApply(Functor functor, gpuStream_t stream,
-                             const int *offsets, int n, int chunk_size,
-                             Args... args) {
+template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch>
+class MultiTensorLauncher {
+ public:
+  MultiTensorLauncher(
+      const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta,
+      const int &chunk_id, const int &chunk_size, const int &block_dim,
+      const gpuStream_t &stream)
+      : meta_(meta),
+        chunk_id_(chunk_id),
+        chunk_size_(chunk_size),
+        block_dim_(block_dim),
+        stream_(stream) {}
+
+  template <typename Functor, typename... Args>
+  void Launch(Functor &&functor, Args &&... args) const {
+    MultiTensorApplyCUDAKernel<
+        Functor, MaxTensorNumPerLaunch,
+        MaxChunkNumPerLaunch><<<chunk_id_, block_dim_, 0, stream_>>>(
+        functor, meta_, chunk_size_, args...);
+  }
+
+ private:
+  const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta_;
+  const int &chunk_id_;
+  const int &chunk_size_;
+  const int &block_dim_;
+  const gpuStream_t &stream_;
+};
+
+template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
+          typename Callback>
+static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets,
+                                         int n, int chunk_size, int block_dim,
+                                         Callback &&callback) {
  if (n == 0) return;

  constexpr auto NumTensor = MaxTensorNumPerLaunch;
@@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
  int numel_offset = 0;
  metas.start_tensor_id = 0;
  metas.start_chunk_id = 0;
+  int launch_num = 0;
+
+  MultiTensorLauncher<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> launcher(
+      metas, chunk_id, chunk_size, block_dim, stream);
+
  for (int i = 0; i < n; ++i) {
    auto length = offsets[i + 1] - offsets[i];
    if (tensor_id == 0) {
@@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
      bool last_chunk = (i + 1 == n && j + 1 == chunk_num);

      if (tensor_full || block_full || last_chunk) {
-        MultiTensorApplyCUDAKernel<Functor, NumTensor,
-                                   NumChunk><<<chunk_id, BlockDim, 0, stream>>>(
-            functor, metas, chunk_size, args...);
+        callback(launcher, launch_num);
+        ++launch_num;
        chunk_id = 0;
        if (j + 1 == chunk_num) {  // chunk for the current tensor is full
          metas.start_chunk_id = 0;
@@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
  }
 }

+template <typename Functor, int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
+          typename... Args>
+static void MultiTensorApply(Functor functor, gpuStream_t stream,
+                             const int *offsets, int n, int chunk_size,
+                             int block_dim, Args &&... args) {
+  auto callback = [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
+                                                MaxChunkNumPerLaunch> &launcher,
+                      int i) { launcher.Launch(functor, args...); };
+  MultiTensorApplyWithCallback<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch>(
+      stream, offsets, n, chunk_size, block_dim, callback);
+}
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) {
 __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) {
  return static_cast<platform::float16>(abs(static_cast<float>(x)));
 }
+
+__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) {
+  return static_cast<platform::bfloat16>(abs(static_cast<float>(x)));
+}
+
 __device__ __forceinline__ float inline_abs(float x) { return abs(x); }
 __device__ __forceinline__ double inline_abs(double x) { return abs(x); }

@@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow(
  return static_cast<platform::float16>(
      pow(static_cast<float>(base), static_cast<float>(exponent)));
 }
+__device__ __forceinline__ platform::bfloat16 inline_pow(
+    platform::bfloat16 base, platform::bfloat16 exponent) {
+  return static_cast<platform::bfloat16>(
+      pow(static_cast<float>(base), static_cast<float>(exponent)));
+}
 __device__ __forceinline__ float inline_pow(float base, float exponent) {
  return pow(base, exponent);
 }
@@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext;

 REGISTER_OP_CUDA_KERNEL(p_norm,
                        ops::PnormCUDAKernel<CUDA, paddle::platform::float16>,
+                        ops::PnormCUDAKernel<CUDA, paddle::platform::bfloat16>,
                        ops::PnormCUDAKernel<CUDA, float>,
                        ops::PnormCUDAKernel<CUDA, double>);
 REGISTER_OP_CUDA_KERNEL(
    p_norm_grad, ops::PnormGradCUDAKernel<CUDA, paddle::platform::float16>,
+    ops::PnormGradCUDAKernel<CUDA, paddle::platform::bfloat16>,
    ops::PnormGradCUDAKernel<CUDA, float>,
    ops::PnormGradCUDAKernel<CUDA, double>);
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
@@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
    CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>,
    CUDAReduceSumGradKernel<paddle::platform::float16>,
+    CUDAReduceSumGradKernel<paddle::platform::bfloat16>,
    CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>,
    CUDAReduceSumGradKernel<paddle::platform::complex<float>>,
    CUDAReduceSumGradKernel<paddle::platform::complex<double>>);
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
 #include <string>

+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of SplitOp should not be null."));
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      platform::errors::InvalidArgument(
-                          "Outputs(Out) of SplitOp should not be empty."));
-    auto in_dims = ctx->GetInputDim("X");
-    auto outs_names = ctx->Outputs("Out");
-    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
-    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
-    std::vector<int> sections = static_cast<std::vector<int>>(
-        ctx->Attrs().Get<std::vector<int>>("sections"));
-    const size_t outs_number = outs_names.size();
-
-    if (sections.size() > 0) {
-      PADDLE_ENFORCE_EQ(
-          sections.size(), outs_number,
-          platform::errors::InvalidArgument("tensor split sections size "
-                                            "should be equal to output size."));
-    }
-
-    if (ctx->HasInput("AxisTensor")) {
-      auto out_dims = phi::make_ddim(std::vector<int>(in_dims.size(), -1));
-      std::vector<framework::DDim> outs_dims(outs_number, out_dims);
-      ctx->SetOutputsDim("Out", outs_dims);
-      for (size_t i = 0; i < outs_number; ++i) {
-        ctx->ShareLoD("X", "Out", 0, i);
-      }
-      return;
-    }
-
-    bool each_section_is_known =
-        (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList"));
-
-    auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known,
-                                    in_dims, num, sections, axis, outs_number);
-    ctx->SetOutputsDim("Out", outs_dims);
-    if (axis != 0) {
-      // Only pass LoD when not spliting along the first dim.
-      for (size_t i = 0; i < outs_number; ++i) {
-        ctx->ShareLoD("X", "Out", 0, i);
-      }
-    }
-  }
-
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
@@ -168,6 +125,10 @@ Example:

 namespace ops = paddle::operators;

+DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor,
+                            PT_INFER_META(phi::SplitInferMeta));
+
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker,
                  ops::SplitGradMaker<paddle::framework::OpDesc>,
-                  ops::SplitGradMaker<paddle::imperative::OpBase>);
+                  ops::SplitGradMaker<paddle::imperative::OpBase>,
+                  SplitInferShapeFunctor);
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL(
    ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    paddle::operators::UniformRandomOpVarTypeInference);

-REGISTER_OP_CPU_KERNEL(
-    uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
-    paddle::operators::CPUUniformRandomKernel<double>,
-    paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
 REGISTER_OP_CPU_KERNEL(
    uniform_random_batch_size_like,
    paddle::operators::CPUUniformRandomKernel<float>,

--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_CUDA_KERNEL(uniform_random,
-                        paddle::operators::GPUUniformRandomKernel<float>,
-                        paddle::operators::GPUUniformRandomKernel<double>);
 REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
                        paddle::operators::GPUUniformRandomKernel<float>,
                        paddle::operators::GPUUniformRandomKernel<double>);
--- a/paddle/fluid/operators/where_op.cc
+++ b/paddle/fluid/operators/where_op.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/where_op.h"
-
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 namespace paddle {
 namespace operators {

@@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where");
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where");
-
-    auto cond_dims = ctx->GetInputDim("Condition");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(
-        cond_dims, x_dims,
-        platform::errors::InvalidArgument(
-            "The dims of Inputs(Condition) and Inputs(X) should be same. "
-            "But received Condition's shape is [%s], X's shape is [%s]",
-            cond_dims, x_dims));
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                      platform::errors::InvalidArgument(
-                          "The dims of Inputs(X) and Inputs(Y) should be same. "
-                          "But received X's shape is [%s], Y's shape is [%s]",
-                          x_dims, y_dims));
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y");
 }  // namespace paddle

 namespace ops = paddle::operators;
+DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
+                            PT_INFER_META(phi::WhereInferMeta));
 REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker,
                  ops::WhereOpGradMaker<paddle::framework::OpDesc>,
-                  ops::WhereOpGradMaker<paddle::imperative::OpBase>);
+                  ops::WhereOpGradMaker<paddle::imperative::OpBase>,
+                  WhereInferShapeFunctor);

 REGISTER_OPERATOR(where_grad, ops::WhereGradOp,
                  ops::WhereGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    where, ops::WhereKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::WhereKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::WhereKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::WhereKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    where_grad, ops::WhereGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::WhereGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/where_op.cu
+++ b/paddle/fluid/operators/where_op.cu
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/where_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-
-namespace platform = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct CondFunctor {
-  HOSTDEVICE inline CondFunctor() {}
-
-  HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const {
-    return cond ? x : y;
-  }
-};
-
-template <typename T>
-__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x,
-                                const T* y, T* out) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    out[idx] = cond[idx] ? x[idx] : y[idx];
-  }
-}
-
-template <typename T>
-__global__ void WhereGradCUDAKernel(const int N, const T* dout,
-                                    const bool* cond, T* dx, T* dy) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    if (dx != nullptr) {
-      dx[idx] = cond[idx] ? dout[idx] : 0.;
-    }
-    if (dy != nullptr) {
-      dy[idx] = cond[idx] ? 0. : dout[idx];
-    }
-  }
-}
-
-template <typename T>
-class WhereKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto numel = condition->numel();
-
-    // TODO(GaaoWei8): Input of where can be broadcast
-    const bool* cond_data = condition->data<bool>();
-    const T* x_data = X->data<T>();
-    const T* y_data = Y->data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    auto stream = context.cuda_device_context().stream();
-    auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    auto functor = CondFunctor<T>();
-    std::vector<const framework::Tensor*> ins = {condition, X, Y};
-    std::vector<framework::Tensor*> outs = {out};
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                              &outs, functor);
-  }
-};
-
-template <typename T>
-class WhereGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    const bool* cond_data = condition->data<bool>();
-    auto numel = condition->numel();
-
-    auto* dout_t =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dout = dout_t->data<T>();
-    T* dx =
-        (dx_t != nullptr) ? dx_t->mutable_data<T>(context.GetPlace()) : nullptr;
-    T* dy =
-        (dy_t != nullptr) ? dy_t->mutable_data<T>(context.GetPlace()) : nullptr;
-
-    auto stream = context.cuda_device_context().stream();
-    auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel());
-    WhereGradCUDAKernel<
-        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-        numel, dout, cond_data, dx, dy);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    where, paddle::operators::WhereKernel<platform::CUDADeviceContext, float>,
-    paddle::operators::WhereKernel<platform::CUDADeviceContext, double>,
-    paddle::operators::WhereKernel<platform::CUDADeviceContext, int>,
-    paddle::operators::WhereKernel<platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    where_grad,
-    paddle::operators::WhereGradKernel<platform::CUDADeviceContext, float>,
-    paddle::operators::WhereGradKernel<platform::CUDADeviceContext, double>,
-    paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int>,
-    paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/where_op.h
+++ b/paddle/fluid/operators/where_op.h
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class WhereKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    const bool* cond_data = condition->data<bool>();
-    const T* x_data = X->data<T>();
-    const T* y_data = Y->data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    auto x_numel = X->numel();
-    for (int i = 0; i < x_numel; i++) {
-      out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class WhereGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::LoDTensor>("Condition");
-    const auto* cond_data = condition->data<bool>();
-    auto numel = condition->numel();
-
-    auto* dout_t =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
-
-    auto* dout = dout_t->data<T>();
-    if (dx_t != nullptr) {
-      auto* dx = dx_t->mutable_data<T>(context.GetPlace());
-      for (int i = 0; i < numel; i++) {
-        dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
-      }
-    }
-    if (dy_t != nullptr) {
-      auto* dy = dy_t->mutable_data<T>(context.GetPlace());
-      for (int i = 0; i < numel; i++) {
-        dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/where_op_npu.cc
+++ b/paddle/fluid/operators/where_op_npu.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/where_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {

--- a/paddle/fluid/operators/where_op_xpu.cc
+++ b/paddle/fluid/operators/where_op_xpu.cc
@@ -14,7 +14,7 @@

 #ifdef PADDLE_WITH_XPU

-#include "paddle/fluid/operators/where_op.h"
+#include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <hip/hip_runtime.h>
 #endif
 #include <stdio.h>
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"

@@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
 #endif
 #endif

+// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
+inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
+  bfloat16 low_half;
+  // the bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
+  bfloat16 high_half;
+  // the bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
+  return *reinterpret_cast<bfloat16 *>(&x);
+}
+
+static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
+  return *reinterpret_cast<__nv_bfloat16 *>(&x);
+}
+
+CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
+  return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
+                                    PDBF16ToCUDABF16(val)));
+}
+#else
+CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
+  // concrete packed bfloat16 value may exsits in lower or higher 16bits
+  // of the 32bits address.
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t sum;
+  uint32_t newval;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // the bfloat16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed,
+                      bf16_add_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // the bfloat16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed,
+                      bf16_add_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+#endif
+
 CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
  float *real = reinterpret_cast<float *>(address);
  float *imag = real + 1;

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -81,7 +81,7 @@ set(PYBIND_SRCS
  cuda_streams_py.cc)

 if(NOT ON_INFER)
-  set (PYBIND_DEPS ${PYBIND_DEPS} processgroup)
+  set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
  if (WITH_NCCL)
    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
  endif()

--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -23,6 +23,7 @@ limitations under the License. */

 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/distributed/collective/reducer.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/imperative/layer.h"
@@ -143,6 +144,19 @@ void BindDistributed(py::module *m) {
                    [](distributed::ProcessGroupStrategy &self, int nrings) {
                      self.nrings_ = nrings;
                    });
+
+  m->def("eager_assign_group_by_size",
+         [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
+            std::vector<size_t> group_size_limits,
+            std::vector<int64_t> tensor_indices) {
+           auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
+           return distributed::Eager_AssignGroupBySize(
+               tensors, is_sparse_gradient, group_size_limits, tensor_indices);
+         },
+         py::arg("tensors"), py::arg("is_sparse_gradient"),
+         py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
+         py::arg("tensor_indices") = std::vector<int64_t>{},
+         py::call_guard<py::gil_scoped_release>());
 }

 }  // end namespace pybind

--- a/paddle/fluid/pybind/op_function_generator.h
+++ b/paddle/fluid/pybind/op_function_generator.h
@@ -60,7 +60,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
    {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
    {"merged_momentum",
     {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
-    {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
+    {"sparse_momentum",
+     {"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}},
    {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
    {"run_program", {"X", "Params"}},
    {"fused_feedforward",
@@ -124,7 +125,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
    {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
    {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
    {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
-    {"sparse_momentum", {"ParamOut", "VelocityOut"}},
+    {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
    {"rnn", {"DropoutState", "Reserve", "Out", "State"}},
    {"run_program", {"DOut"}},
    {"adam",
@@ -181,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
      "out_old_num_accumulates", "out_num_updates"}},
    {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
    {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
-    {"sparse_momentum", {"ParamOut", "VelocityOut"}},
+    {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
    {"batch_norm", {"MeanOut", "VarianceOut"}},
    {"sync_batch_norm", {"MeanOut", "VarianceOut"}},
    {"accuracy", {"Correct", "Total"}},

--- a/paddle/infrt/dialect/infrt/infrt_ops_base.td
+++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td
@@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
  );
 }

+// Type Constrait for concrete DenseTensor type.
+class DenseTensor<string target, string precision, string layout> :
+    Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">, 
+    "!infrt.DenseTensor<"#target#","#precision#","#layout#">", 
+    "::infrt::DenseTensorType">;
+
 // Base class for infrt dialect attributes.
 class Infrt_Attr<string name, list<Trait> traits = [],
                   string baseCppClass = "::mlir::Attribute">

--- a/paddle/infrt/dialect/init_infrt_dialects.cc
+++ b/paddle/infrt/dialect/init_infrt_dialects.cc
@@ -21,8 +21,8 @@
 #include "paddle/infrt/dialect/infrt/infrt_dialect.h"
 #include "paddle/infrt/dialect/infrt_base.h"
 #include "paddle/infrt/dialect/pd_ops.h"
-#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
-#include "paddle/infrt/dialect/phi/phi_base.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"
 #include "paddle/infrt/dialect/tensor_shape.h"

 namespace infrt {

--- a/paddle/infrt/dialect/phi/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/CMakeLists.txt
@@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI)
    return()
 endif()

-#mlir_tablegen_on(infrt_phi_base DIALECT phi)
-add_mlir_dialect(infrt_phi_base phi)
-add_mlir_dialect(infrt_phi_tensor phi_dt)
-add_mlir_dialect(infrt_phi_kernel phi_kernel)
-#mlir_tablegen_on(infrt_phi_tensor)
-
-gather_srcs(infrt_src SRCS
-    phi_base.cc infrt_phi_tensor.cc
-    infrt_phi_tensor.cc)
-
+add_subdirectory(ir)
 add_subdirectory(pass)

 add_executable(phi-exec phi_exec.cc)

--- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt
+#mlir_tablegen_on(infrt_phi_base DIALECT phi)
+add_mlir_dialect(infrt_phi_base phi)
+add_mlir_dialect(infrt_phi_tensor phi_dt)
+add_mlir_dialect(infrt_phi_kernel phi_kernel)
+#mlir_tablegen_on(infrt_phi_tensor)
+
+gather_srcs(infrt_src SRCS
+    phi_base.cc 
+    infrt_phi_tensor.cc)
--- a/paddle/infrt/dialect/phi/infrt_phi_base.td
+++ b/paddle/infrt/dialect/phi/infrt_phi_base.td
--- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td
+++ b/paddle/infrt/dialect/phi/infrt_phi_kernel.td
@@ -4,7 +4,7 @@
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 include "paddle/infrt/dialect/infrt_base.td"
-include "paddle/infrt/dialect/phi/infrt_phi_base.td"
+include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"

 def PHI_KernelDialect : Dialect {
  let name = "phi_kernel";

--- a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc
+++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"

 #include <mlir/IR/BuiltinTypes.h>

-#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc"
-#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc"

 namespace infrt {
 namespace phi {
@@ -25,7 +25,7 @@ namespace phi {
 void PHIDenseTensorDialect::initialize() {
 #define GET_OP_LIST
  addOperations<
-#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"
      >();
 }

@@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() {
 }  // namespace infrt

 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"  // NOLINT
--- a/paddle/infrt/dialect/phi/infrt_phi_tensor.h
+++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.h
@@ -29,11 +29,11 @@
 #include <mlir/Interfaces/LoopLikeInterface.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>

-#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc"
-#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc"

 #include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/phi/phi_base.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"
 // NOLINT
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc"
--- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td
+++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.td
@@ -2,7 +2,7 @@
 #else
 #define PHI_TENSOR

-include "paddle/infrt/dialect/phi/infrt_phi_base.td"
+include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 include "paddle/infrt/dialect/infrt_base.td"

--- a/paddle/infrt/dialect/phi/phi_base.cc
+++ b/paddle/infrt/dialect/phi/phi_base.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/infrt/dialect/phi/phi_base.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"

 #include <mlir/IR/Builders.h>
 #include <mlir/IR/Dialect.h>
@@ -21,8 +21,8 @@
 #include <mlir/IR/TypeUtilities.h>
 #include <mlir/IR/Types.h>
 #include "paddle/infrt/common/global.h"
-#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"
-#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc"

 namespace infrt {
 namespace phi {
@@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type,
 void PHIDialect::initialize() {
  addOperations<
 #define GET_OP_LIST
-#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"  // NOLINT
      >();
  addTypes<
 #define GET_TYPEDEF_LIST
-#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc"  // NOLINT
      >();
 }

@@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const {
 }  // namespace infrt

 #define GET_TYPEDEF_CLASSES
-#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc"  // NOLINT
--- a/paddle/infrt/dialect/phi/phi_base.h
+++ b/paddle/infrt/dialect/phi/phi_base.h
@@ -19,11 +19,13 @@

 #include <string>

-#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc"
-#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"

 #define GET_TYPEDEF_CLASSES
-#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
+
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc"

 namespace mlir {
 namespace OpTrait {

--- a/paddle/infrt/host_context/value.h
+++ b/paddle/infrt/host_context/value.h
@@ -73,7 +73,7 @@ using ValueVariantType =
            std::vector<phi::DenseTensor>,
            paddle::experimental::ScalarBase<phi::DenseTensor>,
            paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
-            std::vector<phi::MetaTensor>,
+            std::vector<phi::MetaTensor*>,
            phi::MetaConfig,
            paddle::experimental::Backend,
            paddle::experimental::DataLayout,

--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -94,12 +94,16 @@ std::vector<Tensor> split_impl(const Tensor& x,
  std::vector<Tensor> out;
  auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out);
  std::vector<phi::MetaTensor> meta_outs;
+  meta_outs.reserve(out_number);
+  std::vector<phi::MetaTensor*> meta_out_ptrs;
+  meta_out_ptrs.reserve(out_number);
  for (size_t i = 0; i < out_number; ++i) {
    meta_outs.push_back(dense_outs[i]);
+    meta_out_ptrs.push_back(&meta_outs.back());
  }

  phi::SplitInferMeta(
-      MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs);
+      MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs);

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,

--- a/paddle/phi/api/lib/utils/tensor_utils.cc
+++ b/paddle/phi/api/lib/utils/tensor_utils.cc
@@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList(
  return result;
 }

-void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
-                                       const phi::TensorArgDef& arg_def) {
-  VLOG(5) << "ResetTensor by TensorArgDef.";
-  if (phi::DenseTensor::classof(dst)) {
-    auto* dense_t = static_cast<phi::DenseTensor*>(dst);
-    auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t);
-    meta->dtype = arg_def.dtype;
-    meta->layout = arg_def.layout;
-  } else if (phi::SelectedRows::classof(dst)) {
-    auto* selected_rows = static_cast<phi::SelectedRows*>(dst);
-    auto* meta =
-        phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
-    meta->dtype = arg_def.dtype;
-    meta->layout = arg_def.layout;
-  } else {
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "Unsupported tensor type is received when reseting tensor dtype and "
-        "layout by argument definition."));
-  }
-}
-
 }  // namespace experimental
 }  // namespace paddle
--- a/paddle/phi/api/lib/utils/tensor_utils.h
+++ b/paddle/phi/api/lib/utils/tensor_utils.h
@@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable);
 phi::ScalarArray MakePhiScalarArrayFromVarList(
    const std::vector<framework::Variable*>& variable_list);

-void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
-                                       const phi::TensorArgDef& arg_def);
-
 }  // namespace experimental
 }  // namespace paddle
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext {
 // must use different function name for cudnn kernel
 using GPUDNNContext = GPUContext;

+// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
+// because we want to implement a KPS-based kernel and make it run
+// on GPU and XPU at the same time, so we need KPSContext when registering
+// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using KPSContext = GPUContext;
+#endif
+
 }  // namespace phi
--- a/paddle/phi/backends/xpu/xpu_context.h
+++ b/paddle/phi/backends/xpu/xpu_context.h
@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext {
  std::unique_ptr<Impl> impl_;
 };

+// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
+// because we want to implement a KPS-based kernel and make it run
+// on GPU and XPU at the same time, so we need KPSContext when registering
+// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
+#if PADDLE_WITH_XPU_KP
+using KPSContext = XPUContext;
+#endif
+
 }  // namespace phi
--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -52,6 +52,9 @@ enum class Backend : uint8_t {
  MKLDNN,
  GPUDNN,  // cuDNN and hipDNN

+  // paddle kernel primitives backend
+  KPS,
+
  // end of backend types
  NUM_BACKENDS,

@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
    case Backend::GPUDNN:
      os << "GPUDNN";
      break;
+    case Backend::KPS:
+      os << "KPS";
+      break;
    default: {
      size_t device_type_id_ = static_cast<size_t>(backend) -
                               static_cast<size_t>(Backend::NUM_BACKENDS);
@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
    return Backend::MKLDNN;
  } else if (s == std::string("GPUDNN")) {
    return Backend::GPUDNN;
+  } else if (s == std::string("KPS")) {
+    return Backend::KPS;
  } else {
    return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
                                phi::GetOrRegisterGlobalDeviceTypeId(s));

--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_

 cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
 cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
-cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
 cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
+cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)

 cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)


--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
    case phi::Backend::XPU:
      return phi::XPUPlace(
          set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
+#endif
+    case phi::Backend::KPS:
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      return phi::GPUPlace(
+          set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
+#elif defined(PADDLE_WITH_XPU_KP)
+      return phi::XPUPlace(
+          set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
 #endif
    default: {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE

--- a/paddle/phi/core/custom_kernel.cc
+++ b/paddle/phi/core/custom_kernel.cc
@@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
  auto& kernel_info_map = custom_kernel_map.GetMap();
  VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();

+  auto& kernels = KernelFactory::Instance().kernels();
  for (auto& pair : kernel_info_map) {
-    PADDLE_ENFORCE_EQ(
-        KernelFactory::Instance().HasCompatiblePhiKernel(pair.first),
-        true,
+    PADDLE_ENFORCE_NE(
+        kernels.find(pair.first),
+        kernels.end(),
        phi::errors::InvalidArgument(
            "The kernel %s is not ready for custom kernel registering.",
            pair.first));

    for (auto& info_pair : pair.second) {
-      auto& kernels = KernelFactory::Instance().kernels();
      PADDLE_ENFORCE_EQ(
          kernels[pair.first].find(info_pair.first),
          kernels[pair.first].end(),

--- a/paddle/phi/core/dense_tensor_impl.cc
+++ b/paddle/phi/core/dense_tensor_impl.cc
@@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
 // Note: When you reset holder, you need to ensure the offset is correct
 void DenseTensor::ResetHolder(const std::shared_ptr<phi::Allocation>& holder) {
  if (holder_) {
-    // TODO(zyfncg): The change of static_cast<> in check will recover back
-    // when SetAllocationForOutputTenosr is deleted.
-    // Now the numel() may return -1, and will cast to a very large number when
-    // compare with a data with unsigned long type, this will make checking
-    // failed, so it's a temporary solution to deal with this problem.
    PADDLE_ENFORCE_LE(
        numel() * static_cast<int64_t>(SizeOf(dtype())) +
            static_cast<int64_t>(meta_.offset),

--- a/paddle/phi/core/infermeta_utils.cc
+++ b/paddle/phi/core/infermeta_utils.cc
@@ -75,13 +75,13 @@ paddle::optional<const phi::MetaTensor&> InferMetaContext::OptionalInputAt(
               : paddle::optional<const phi::MetaTensor&>{paddle::none};
 }

-std::vector<MetaTensor> InferMetaContext::InputsBetween(size_t start,
+std::vector<MetaTensor*> InferMetaContext::InputsBetween(size_t start,
                                                         size_t end) const {
-  std::vector<MetaTensor> result;
+  std::vector<MetaTensor*> result;
  result.reserve(end - start);

  for (size_t i = start; i < end; ++i) {
-    result.emplace_back(*inputs_.at(i));
+    result.push_back(inputs_.at(i).get());
  }

  return result;
@@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) {
  return outputs_.at(idx).get();
 }

-std::vector<MetaTensor> InferMetaContext::MutableOutputBetween(size_t start,
+std::vector<MetaTensor*> InferMetaContext::MutableOutputBetween(size_t start,
                                                                size_t end) {
-  std::vector<MetaTensor> result;
+  std::vector<MetaTensor*> result;
  result.reserve(end - start);
  for (size_t i = start; i < end; ++i) {
-    result.emplace_back(*outputs_.at(i));
+    result.emplace_back(outputs_.at(i).get());
  }
  return result;
 }

--- a/paddle/phi/core/infermeta_utils.h
+++ b/paddle/phi/core/infermeta_utils.h
@@ -50,13 +50,13 @@ class InferMetaContext {
  const std::pair<int, int>& OutputRangeAt(size_t idx) const;

  const MetaConfig& GetMetaConfig() const;
-  const MetaTensor& InputAt(size_t idx) const;

+  const MetaTensor& InputAt(size_t idx) const;
  paddle::optional<const phi::MetaTensor&> OptionalInputAt(size_t idx) const;
+  std::vector<MetaTensor*> InputsBetween(size_t start, size_t end) const;

-  std::vector<MetaTensor> InputsBetween(size_t start, size_t end) const;
  MetaTensor* MutableOutputAt(size_t idx);
-  std::vector<MetaTensor> MutableOutputBetween(size_t start, size_t end);
+  std::vector<MetaTensor*> MutableOutputBetween(size_t start, size_t end);

  template <typename AttrType>
  AttrType AttrAt(size_t idx) {
@@ -157,7 +157,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
  };

  template <typename... Tail>
-  struct InferMetaFnCallHelper<const std::vector<MetaTensor>&, Tail...> {
+  struct InferMetaFnCallHelper<const std::vector<MetaTensor*>&, Tail...> {
    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
    static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
      static_assert(attr_idx == 0,
@@ -165,7 +165,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
      static_assert(out_idx == 0,
                    "InferMeta's Input should appear before Outputs.");
      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
-      std::vector<MetaTensor> arg =
+      std::vector<MetaTensor*> arg =
          ctx->InputsBetween(range.first, range.second);
      InferMetaFnCallHelper<
          Tail...>::template Call<in_idx + 1, attr_idx, out_idx>(ctx,
@@ -210,13 +210,12 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
  };

  template <typename... Tail>
-  struct InferMetaFnCallHelper<std::vector<MetaTensor>*, Tail...> {
+  struct InferMetaFnCallHelper<std::vector<MetaTensor*>, Tail...> {
    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
    static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
-      std::vector<MetaTensor> tmp =
+      std::vector<MetaTensor*> arg =
          ctx->MutableOutputBetween(range.first, range.second);
-      std::vector<MetaTensor>* arg = &tmp;
      InferMetaFnCallHelper<
          Tail...>::template Call<in_idx, attr_idx, out_idx + 1>(ctx,
                                                                 pargs...,

--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -87,13 +87,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                              default_tensor_layout,
                              default_key.dtype(),
                              arg_type);
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
      } else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
        args_def->AppendInput(default_key.backend(),
                              default_tensor_layout,
                              default_key.dtype(),
                              arg_type);
-#endif
      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
        args_def->AppendOutput(default_key.backend(),
                               default_tensor_layout,
@@ -105,13 +103,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                               default_tensor_layout,
                               default_key.dtype(),
                               arg_type);
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
      } else if (arg_type == std::type_index(typeid(SelectedRows*))) {
        args_def->AppendOutput(default_key.backend(),
                               default_tensor_layout,
                               default_key.dtype(),
                               arg_type);
-#endif
      } else {
        // Attribute deal with
        // TODO(chenweihang): now here allow any types of attribute, maybe

--- a/paddle/phi/core/kernel_utils.h
+++ b/paddle/phi/core/kernel_utils.h
@@ -23,9 +23,7 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_context.h"
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
 #include "paddle/phi/core/selected_rows.h"
-#endif
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
 #include "paddle/phi/core/type_defs.h"
@@ -223,9 +221,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
-#endif

  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
@@ -260,9 +256,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {

  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
-#endif

  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);

--- a/paddle/phi/core/tensor_meta.h
+++ b/paddle/phi/core/tensor_meta.h
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/utils/any.h"
 #include "paddle/utils/optional.h"

-// Note: mixed_vector include many header now, LoD will be
-// used on CUDA device? Can we use small_vector here?
-// @zhanlve: Rollback to original LoD for now
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
-#include "paddle/fluid/framework/mixed_vector.h"
-#endif
-
 namespace phi {

 using DDim = phi::DDim;

--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input,
  out->share_lod(input);
 }

+void IndexSampleInferMeta(const MetaTensor& x,
+                          const MetaTensor& y,
+                          MetaTensor* out,
+                          MetaConfig config) {
+  auto input_dims = x.dims();
+  PADDLE_ENFORCE_EQ(input_dims.size(),
+                    2,
+                    errors::InvalidArgument(
+                        "Inputs(X) shape of IndexSample op should be 2-D, but "
+                        "got X's shape = [%s], please check X shape.",
+                        input_dims));
+
+  auto index_dims = y.dims();
+  PADDLE_ENFORCE_EQ(
+      index_dims.size(),
+      2,
+      errors::InvalidArgument(
+          "Inputs(Index) shape of IndexSample op should be 2-D, but "
+          "got Index's shape [%s] , please check index shape.",
+          input_dims));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(input_dims[0],
+                      index_dims[0],
+                      errors::InvalidArgument(
+                          "Inputs(X)'s value of dimension 0 must same with "
+                          "Inputs(Index)'s value of dimension 0, but "
+                          "got %d of Inputs(X), and got %d of Inputs(Index), "
+                          "please check Inputs shape.",
+                          input_dims[0],
+                          index_dims[0]));
+  }
+  out->set_dtype(x.dtype());
+  out->set_dims(index_dims);
+  out->share_lod(y);
+}
 void CrossInferMeta(const MetaTensor& x,
                    const MetaTensor& y,
                    int axis,
@@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x,
 }

 void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
-  auto in_dims = x.dims();
-  out->set_dims(in_dims);
+  out->share_meta(x);
 }

 void BCELossInferMeta(const MetaTensor& input,

--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta,
                        MetaTensor* residual,
                        MetaConfig config = MetaConfig());

+void IndexSampleInferMeta(const MetaTensor& x,
+                          const MetaTensor& y,
+                          MetaTensor* out,
+                          MetaConfig config = MetaConfig());
+
 void CrossInferMeta(const MetaTensor& x,
                    const MetaTensor& y,
                    int axis,

--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
  out->set_dtype(x.dtype());
 }

-void ConcatInferMeta(const std::vector<MetaTensor>& x,
+void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                     const Scalar& axis_scalar,
                     MetaTensor* out,
                     MetaConfig config) {
@@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
                    phi::errors::InvalidArgument(
                        "The size of input meta vector should be greater"
                        "than 0."));
+  if (axis_scalar.FromTensor()) {
+    auto out_dims =
+        phi::make_ddim(std::vector<int>(x.at(0)->dims().size(), -1));
+    out->set_dims(out_dims);
+    out->set_dtype(x.at(0)->dtype());
+    out->set_layout(x.at(0)->layout());
+    out->share_lod(*x.at(0));
+    return;
+  }

  int axis = axis_scalar.to<int>();
  // 1. calculate axis
-  int rank = x.at(0).dims().size();
+  int rank = x.at(0)->dims().size();
  PADDLE_ENFORCE_EQ(
      axis >= -rank && axis < rank,
      true,
@@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,

  // 2. calculate out dims
  std::vector<phi::DDim> x_dims;
-  for (auto& x_t : x) {
-    x_dims.push_back(x_t.dims());
+  x_dims.reserve(x.size());
+  for (const auto* x_t : x) {
+    x_dims.emplace_back(x_t->dims());
  }
  phi::DDim out_dim =
      phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis);

  out->set_dims(out_dim);
-  out->set_dtype(x.at(0).dtype());
-  out->set_layout(x.at(0).layout());
+  out->set_dtype(x.at(0)->dtype());
+  out->set_layout(x.at(0)->layout());
+  out->share_lod(*x.at(0));
+}
+
+void WhereInferMeta(const MetaTensor& condition,
+                    const MetaTensor& x,
+                    const MetaTensor& y,
+                    MetaTensor* out) {
+  auto cond_dims = condition.dims();
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  PADDLE_ENFORCE_EQ(
+      cond_dims,
+      x_dims,
+      phi::errors::InvalidArgument(
+          "The dims of Inputs(Condition) and Inputs(X) should be same. "
+          "But received Condition's shape is [%s], X's shape is [%s]",
+          cond_dims,
+          x_dims));
+  PADDLE_ENFORCE_EQ(x_dims,
+                    y_dims,
+                    phi::errors::InvalidArgument(
+                        "The dims of Inputs(X) and Inputs(Y) should be same. "
+                        "But received X's shape is [%s], Y's shape is [%s]",
+                        x_dims,
+                        y_dims));
+  out->share_meta(x);
 }

 }  // namespace phi
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
                                    MetaTensor* out,
                                    MetaConfig config = MetaConfig());

-void ConcatInferMeta(const std::vector<MetaTensor>& x,
+void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                     const Scalar& axis_scalar,
                     MetaTensor* out,
                     MetaConfig config = MetaConfig());

+void WhereInferMeta(const MetaTensor& condition,
+                    const MetaTensor& x,
+                    const MetaTensor& y,
+                    MetaTensor* out);
 }  // namespace phi
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x,
 void SplitInferMeta(const MetaTensor& x,
                    const ScalarArray& num_or_sections,
                    const Scalar& axis,
-                    std::vector<MetaTensor>* out,
+                    std::vector<MetaTensor*> out,
                    MetaConfig config) {
+  if (!config.is_runtime) {
+    if (axis.FromTensor() || num_or_sections.FromTensor()) {
+      auto out_dims = phi::make_ddim(std::vector<int>(x.dims().size(), -1));
+      for (auto* item : out) {
+        item->set_dims(out_dims);
+        item->share_lod(x);
+      }
+      return;
+    }
+  }
+
  int axis_value = axis.to<int>();
  int rank = x.dims().size();
  PADDLE_ENFORCE_EQ(
@@ -475,15 +486,16 @@ void SplitInferMeta(const MetaTensor& x,
    axis_value = axis_value + rank;
  }

+  std::vector<phi::DDim> out_dims(out.size(), x.dims());
+
  auto input_axis_dim = x.dims().at(axis_value);
  auto num_or_sections_data = num_or_sections.GetData();
-  // step1: get formated sections
-  std::vector<int64_t> sections;
  // num_or_sections is a number
  if (num_or_sections_data.size() == 1) {
+    if (config.is_runtime || input_axis_dim > 0) {
      int num = num_or_sections_data.at(0);
-
-    PADDLE_ENFORCE_EQ(input_axis_dim % num,
+      PADDLE_ENFORCE_EQ(
+          input_axis_dim % num,
          0,
          phi::errors::InvalidArgument(
              "The input's size along the split dimension "
@@ -494,8 +506,14 @@ void SplitInferMeta(const MetaTensor& x,
              x.dims(),
              axis_value));

-    for (int i = 0; i < num; ++i) {
-      sections.push_back(input_axis_dim / num);
+      size_t out_axis_dim = input_axis_dim / num;
+      for (auto& out_dim : out_dims) {
+        out_dim[axis_value] = out_axis_dim;
+      }
+    } else {
+      for (auto& out_dim : out_dims) {
+        out_dim[axis_value] = -1;
+      }
    }
  } else {
    // num_or_sections is a sections
@@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x,
    int unknow_dim_idx = -1;
    int num_of_unknow = 0;
    int sum_of_section = 0;
+    std::vector<int64_t> sections = num_or_sections_data;

    for (size_t i = 0; i < num_or_sections_data.size(); ++i) {
-      sections.push_back(num_or_sections_data[i]);
-
      if (num_or_sections_data[i] == unknow_dim_val) {
        num_of_unknow++;
        unknow_dim_idx = i;
@@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x,
              x.dims(),
              axis_value));
    }
-  }
-
-  // setp2: fill out dims
-  std::vector<phi::DDim> out_dims(sections.size(), x.dims());
-  if (config.is_runtime || input_axis_dim > 0) {
-    for (size_t i = 0; i < sections.size(); ++i) {
+    for (size_t i = 0; i < out_dims.size(); ++i) {
      out_dims[i][axis_value] = sections[i];
    }
-  } else {
-    for (size_t i = 0; i < sections.size(); ++i) {
-      out_dims[i][axis_value] = -1;
-    }
  }

-  for (size_t i = 0; i < sections.size(); ++i) {
+  for (size_t i = 0; i < out.size(); ++i) {
    if (axis_value != 0) {
      // Only pass LoD when not spliting along the first dim.
-      (*out)[i].set_dtype(x.dtype());
-      (*out)[i].set_dims(out_dims[i]);
-      (*out)[i].set_layout(x.layout());
+      out.at(i)->set_dtype(x.dtype());
+      out.at(i)->set_dims(out_dims[i]);
+      out.at(i)->set_layout(x.layout());
    } else {
-      (*out)[i].set_dtype(x.dtype());
-      (*out)[i].set_dims(out_dims[i]);
-      (*out)[i].set_layout(x.layout());
-      (*out)[i].share_lod(x);
+      out.at(i)->set_dtype(x.dtype());
+      out.at(i)->set_dims(out_dims[i]);
+      out.at(i)->set_layout(x.layout());
+      out.at(i)->share_lod(x);
    }
  }
 }

--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x,
 void SplitInferMeta(const MetaTensor& x_meta,
                    const ScalarArray& num_or_sections,
                    const Scalar& axis,
-                    std::vector<MetaTensor>* out,
+                    std::vector<MetaTensor*> out,
                    MetaConfig config = MetaConfig());

 void UnbindInferMeta(const MetaTensor& x,

--- a/paddle/phi/kernels/concat_kernel.h
+++ b/paddle/phi/kernels/concat_kernel.h
@@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx,
                   const std::vector<DenseTensor>& x,
                   const Scalar& axis) {
  std::vector<MetaTensor> meta_x;
+  meta_x.reserve(x.size());
+  std::vector<MetaTensor*> meta_x_ptr;
  for (const auto& t : x) {
    meta_x.emplace_back(t);
+    meta_x_ptr.push_back(&meta_x.back());
  }

  auto dense_out = phi::Empty<T, Context>(dev_ctx);
  MetaTensor meta_out(&dense_out);
-  ConcatInferMeta(meta_x, axis.to<int>(), &meta_out, /*is_runtime=*/true);
+  ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out, /*is_runtime=*/true);
  ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
  return dense_out;
 }

--- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/phi/kernels/atan2_grad_kernel.h"
+#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"

 PD_REGISTER_KERNEL(atan2_grad,
                   CPU,

--- a/paddle/phi/kernels/cpu/atan2_kernel.cc
+++ b/paddle/phi/kernels/cpu/atan2_kernel.cc
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/phi/kernels/atan2_kernel.h"
+#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"

 PD_REGISTER_KERNEL(atan2,
                   CPU,

--- a/paddle/phi/kernels/cpu/concat_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_kernel.cc
@@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx,
  axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());

  std::vector<phi::DDim> x_dims;
+  x_dims.reserve(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    x_dims.push_back(x[i].dims());
  }
@@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx,
    }
  } else {
    std::vector<phi::DenseTensor> inputs;
+    inputs.reserve(x.size());
    for (size_t j = 0; j < x.size(); ++j) {
      if (x[j].numel() > 0) {
-        inputs.push_back(x[j]);
+        inputs.emplace_back(x[j]);
      } else {
        continue;
      }

--- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
--- a/paddle/phi/kernels/cpu/index_sample_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc
--- a/paddle/phi/kernels/cpu/logical_kernel.cc
+++ b/paddle/phi/kernels/cpu/logical_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/logical_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
+#include "paddle/phi/kernels/funcs/logical_functor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace phi {
+
+#define DEFINE_LOGICAL_BINARY_KERNEL(type)                         \
+  template <typename T, typename Context>                          \
+  void Logical##type##Kernel(const Context& dev_ctx,               \
+                             const DenseTensor& x,                 \
+                             const DenseTensor& y,                 \
+                             DenseTensor* out) {                   \
+    funcs::Logical##type##Functor<T> binary_func;                  \
+    ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
+        dev_ctx, x, y, -1, binary_func, out);                      \
+  }
+
+DEFINE_LOGICAL_BINARY_KERNEL(And)
+DEFINE_LOGICAL_BINARY_KERNEL(Or)
+DEFINE_LOGICAL_BINARY_KERNEL(Xor)
+#undef DEFINE_LOGICAL_BINARY_KERNEL
+
+template <typename T, typename Context>
+void LogicalNotKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      DenseTensor* out) {
+  auto* out_ptr = dev_ctx.template Alloc<bool>(out);
+  funcs::LogicalNotFunctor<T> unary_func;
+
+  paddle::platform::Transform<Context> trans;
+  trans(dev_ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);
+}
+
+}  // namespace phi
+
+#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \
+  PD_REGISTER_KERNEL(logical_and,                           \
+                     CPU,                                   \
+                     ALL_LAYOUT,                            \
+                     phi::Logical##func_type##Kernel,       \
+                     float,                                 \
+                     double,                                \
+                     bool,                                  \
+                     int64_t,                               \
+                     int,                                   \
+                     int8_t,                                \
+                     int16_t) {}
+
+REGISTER_LOGICAL_CPU_KERNEL(logical_and, And)
+REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or)
+REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not)
+REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor)
--- a/paddle/phi/kernels/cpu/split_kernel.cc
+++ b/paddle/phi/kernels/cpu/split_kernel.cc
@@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx,
                 const ScalarArray& num_or_sections,
                 const Scalar& axis_scalar,
                 std::vector<DenseTensor*> outs) {
-  // need to infershape output
-  if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
-    std::vector<MetaTensor> out_metas;
-    for (size_t i = 0; i < outs.size(); ++i) {
-      out_metas.push_back(outs[i]);
-    }
-
-    phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
-
-    for (size_t i = 0; i < out_metas.size(); ++i) {
-      outs[i]->Resize(out_metas[i].dims());
-    }
-  }
-
  std::vector<const DenseTensor*> shape_refer;
  for (size_t j = 0; j < outs.size(); ++j) {
    dev_ctx.template Alloc<T>(outs[j]);

--- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc
+++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc
--- a/paddle/phi/kernels/cpu/where_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/where_grad_kernel.cc
--- a/paddle/phi/kernels/cpu/where_kernel.cc
+++ b/paddle/phi/kernels/cpu/where_kernel.cc
--- a/paddle/phi/kernels/funcs/aligned_vector.h
+++ b/paddle/phi/kernels/funcs/aligned_vector.h
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
--- a/paddle/phi/kernels/funcs/index_impl.cu.h
+++ b/paddle/phi/kernels/funcs/index_impl.cu.h
--- a/paddle/phi/kernels/funcs/logical_functor.h
+++ b/paddle/phi/kernels/funcs/logical_functor.h
--- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu
--- a/paddle/phi/kernels/gpu/atan2_kernel.cu
+++ b/paddle/phi/kernels/gpu/atan2_kernel.cu
--- a/paddle/phi/kernels/gpu/cast_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_kernel.cu
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
--- a/paddle/phi/kernels/gpu/logical_kernel.cu
+++ b/paddle/phi/kernels/gpu/logical_kernel.cu
--- a/paddle/phi/kernels/gpu/math_kernel.cu
+++ b/paddle/phi/kernels/gpu/math_kernel.cu
--- a/paddle/phi/kernels/gpu/scale_kernel.cu
+++ b/paddle/phi/kernels/gpu/scale_kernel.cu
--- a/paddle/phi/kernels/gpu/split_kernel.cu
+++ b/paddle/phi/kernels/gpu/split_kernel.cu
--- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu
--- a/paddle/phi/kernels/gpu/where_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu
--- a/paddle/phi/kernels/gpu/where_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_kernel.cu
--- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
--- a/paddle/phi/kernels/impl/atan2_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h
--- a/paddle/phi/kernels/index_sample_grad_kernel.h
+++ b/paddle/phi/kernels/index_sample_grad_kernel.h
--- a/paddle/phi/kernels/index_sample_kernel.h
+++ b/paddle/phi/kernels/index_sample_kernel.h
--- a/paddle/phi/kernels/logical_kernel.h
+++ b/paddle/phi/kernels/logical_kernel.h
--- a/paddle/phi/kernels/math_kernel.cc
+++ b/paddle/phi/kernels/math_kernel.cc
--- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
--- a/paddle/phi/kernels/split_kernel.h
+++ b/paddle/phi/kernels/split_kernel.h
--- a/paddle/phi/kernels/uniform_random_kernel.h
+++ b/paddle/phi/kernels/uniform_random_kernel.h
--- a/paddle/phi/kernels/where_grad_kernel.h
+++ b/paddle/phi/kernels/where_grad_kernel.h
--- a/paddle/phi/kernels/where_kernel.h
+++ b/paddle/phi/kernels/where_kernel.h
--- a/paddle/phi/kernels/xpu/full_kernel.cc
+++ b/paddle/phi/kernels/xpu/full_kernel.cc
--- a/paddle/phi/ops/compat/index_sample_sig.cc
+++ b/paddle/phi/ops/compat/index_sample_sig.cc
--- a/paddle/phi/ops/compat/uniform_random_sig.cc
+++ b/paddle/phi/ops/compat/uniform_random_sig.cc
--- a/paddle/phi/ops/compat/where_grad_sig.cc
+++ b/paddle/phi/ops/compat/where_grad_sig.cc
--- a/paddle/phi/tests/common/test_backend.cc
+++ b/paddle/phi/tests/common/test_backend.cc
--- a/paddle/phi/tests/core/test_custom_kernel.cc
+++ b/paddle/phi/tests/core/test_custom_kernel.cc
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_diff_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diff_op.py
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_group.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
--- a/python/paddle/utils/code_gen/api_base.py
+++ b/python/paddle/utils/code_gen/api_base.py
--- a/tools/ci_op_benchmark.sh
+++ b/tools/ci_op_benchmark.sh
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py